Diffstat (limited to 'net')
61 files changed, 884 insertions, 1782 deletions
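The tunnel, compat and sock hunks below call helpers that this 'net'-limited view does not show; they would live in headers outside net/ (e.g. include/net/ip_tunnels.h and include/net/sock.h). The following is a minimal sketch, an assumption inferred from how the hunks use them, of what pskb_inet_may_pull() and the sock_read_timestamp()/sock_write_timestamp() pair are expected to do: the former pulls the inner IPv4/IPv6 header into the linear area before the xmit paths dereference it, the latter wrap sk->sk_stamp in sk_stamp_seq on 32-bit builds so readers never observe a torn 64-bit timestamp.

/* Sketch (assumption): inner-header validation used by the *_xmit hunks below. */
static inline bool pskb_inet_may_pull(struct sk_buff *skb)
{
	int nhlen;

	switch (skb->protocol) {
#if IS_ENABLED(CONFIG_IPV6)
	case htons(ETH_P_IPV6):
		nhlen = sizeof(struct ipv6hdr);
		break;
#endif
	case htons(ETH_P_IP):
		nhlen = sizeof(struct iphdr);
		break;
	default:
		nhlen = 0;
	}

	/* pull up to the inner network header; the skb is not freed on failure,
	 * callers jump to their tx_err/free_skb labels themselves
	 */
	return pskb_network_may_pull(skb, nhlen);
}

/* Sketch (assumption): seqlock-protected sk_stamp accessors used by the
 * net/compat.c and net/core/sock.c hunks; the seqlock only exists on 32-bit,
 * where a ktime_t store is not a single atomic write.
 */
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG == 32
	unsigned int seq;
	ktime_t kt;

	do {
		seq = read_seqbegin(&sk->sk_stamp_seq);
		kt = sk->sk_stamp;
	} while (read_seqretry(&sk->sk_stamp_seq, seq));

	return kt;
#else
	return sk->sk_stamp;
#endif
}

static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG == 32
	write_seqlock(&sk->sk_stamp_seq);
	sk->sk_stamp = kt;
	write_sequnlock(&sk->sk_stamp_seq);
#else
	sk->sk_stamp = kt;
#endif
}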
diff --git a/net/9p/client.c b/net/9p/client.c index 2c9a17b9b46b..357214a51f13 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -181,6 +181,12 @@ static int parse_opts(char *opts, struct p9_client *clnt) ret = r; continue; } + if (option < 4096) { + p9_debug(P9_DEBUG_ERROR, + "msize should be at least 4k\n"); + ret = -EINVAL; + continue; + } clnt->msize = option; break; case Opt_trans: @@ -983,10 +989,18 @@ static int p9_client_version(struct p9_client *c) else if (!strncmp(version, "9P2000", 6)) c->proto_version = p9_proto_legacy; else { + p9_debug(P9_DEBUG_ERROR, + "server returned an unknown version: %s\n", version); err = -EREMOTEIO; goto error; } + if (msize < 4096) { + p9_debug(P9_DEBUG_ERROR, + "server returned a msize < 4096: %d\n", msize); + err = -EREMOTEIO; + goto error; + } if (msize < c->msize) c->msize = msize; @@ -1043,6 +1057,13 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (clnt->msize > clnt->trans_mod->maxsize) clnt->msize = clnt->trans_mod->maxsize; + if (clnt->msize < 4096) { + p9_debug(P9_DEBUG_ERROR, + "Please specify a msize of at least 4k\n"); + err = -EINVAL; + goto free_client; + } + err = p9_client_version(clnt); if (err) goto close_trans; diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index b718db2085b2..3dff68f05fb9 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/module.h> +#include "trans_common.h" /** * p9_release_pages - Release pages after the transaction. diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c603d33d5410..5d01edf8d819 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -653,15 +653,22 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } - dev = dev_get_by_name(&init_net, devname); + rtnl_lock(); + dev = __dev_get_by_name(&init_net, devname); if (!dev) { + rtnl_unlock(); res = -ENODEV; break; } ax25->ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25->ax25_dev) { + rtnl_unlock(); + res = -ENODEV; + break; + } ax25_fillin_cb(ax25, ax25->ax25_dev); - dev_put(dev); + rtnl_unlock(); break; default: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 9a3a301e1e2f..d92195cd7834 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -116,6 +116,7 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; dev_put(dev); kfree(ax25_dev); return; @@ -125,6 +126,7 @@ void ax25_dev_device_down(struct net_device *dev) if (s->next == ax25_dev) { s->next = ax25_dev->next; spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; dev_put(dev); kfree(ax25_dev); return; diff --git a/net/compat.c b/net/compat.c index f7084780a8f8..c3a2f868e8af 100644 --- a/net/compat.c +++ b/net/compat.c @@ -467,12 +467,14 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) ctv = (struct compat_timeval __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); + if (tv.tv_sec == -1) return err; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } err = 0; if (put_user(tv.tv_sec, &ctv->tv_sec) || @@ -494,12 +496,13 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta ctv = (struct 
compat_timespec __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return err; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - ts = ktime_to_timespec(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec(kt); } err = 0; if (put_user(ts.tv_sec, &ctv->tv_sec) || diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d05402868575..158264f7cfaf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -793,8 +793,13 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops->get_regs_len) - info.regdump_len = ops->get_regs_len(dev); + if (ops->get_regs_len) { + int ret = ops->get_regs_len(dev); + + if (ret > 0) + info.regdump_len = ret; + } + if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); @@ -1337,6 +1342,9 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) return -EFAULT; reglen = ops->get_regs_len(dev); + if (reglen <= 0) + return reglen; + if (regs.len > reglen) regs.len = reglen; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 48f61885fd6f..5ea1bed08ede 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4104,6 +4104,11 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (!addr) { + NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); + return -EINVAL; + } + if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { diff --git a/net/core/sock.c b/net/core/sock.c index f00902c532cc..6aa2e7e0b4fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2751,6 +2751,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL @@ -2850,12 +2853,13 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); if (tv.tv_sec == -1) return -ENOENT; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; } @@ -2866,11 +2870,12 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) struct timespec ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); ts = ktime_to_timespec(sk->sk_stamp); } return copy_to_user(userstamp, &ts, sizeof(ts)) ? 
-EFAULT : 0; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f8eb78d042a4..cfec3af54c8d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -198,11 +198,15 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) static struct fib_table *fib_empty_table(struct net *net) { - u32 id; + u32 id = 1; - for (id = 1; id <= RT_TABLE_MAX; id++) + while (1) { if (!fib_get_table(net, id)) return fib_new_table(net, id); + + if (id++ == RT_TABLE_MAX) + break; + } return NULL; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c7a7bd58a23c..d1d09f3e5f9e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -676,6 +676,9 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -719,6 +722,9 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { erspan_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -762,6 +768,9 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 284a22154b4e..c4f5602308ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,7 +627,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); - unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -637,14 +636,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; - /* ensure we can access the inner net header, for several users below */ - if (skb->protocol == htons(ETH_P_IP)) - inner_nhdr_len = sizeof(struct iphdr); - else if (skb->protocol == htons(ETH_P_IPV6)) - inner_nhdr_len = sizeof(struct ipv6hdr); - if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) - goto tx_error; - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b302d69c..d7b43e700023 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -241,6 +241,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { @@ -253,15 +256,18 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); break; default: - dev->stats.tx_errors++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; + goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e471f1cf9..8eeec6eb2bd3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4736,8 +4736,8 @@ inet6_rtm_newaddr(struct 
sk_buff *skb, struct nlmsghdr *nlh, IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (!idev) + return -ENOBUFS; if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f0cd291034f0..0bfb6cc0a30a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -350,6 +350,9 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EINVAL; goto out_unlock; } + } + + if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ae3786132c23..6613d8dbb0e5 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -627,7 +627,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } - res = fib6_dump_table(tb, skb, cb); + if (!cb->args[0]) { + res = fib6_dump_table(tb, skb, cb); + if (!res) + cb->args[0] = 1; + } goto out; } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 229e55c99021..09d0826742f8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -881,6 +881,9 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -923,6 +926,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int nhoff; int thoff; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -995,8 +1001,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1004,7 +1008,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 99179b9c8384..0c6403cf8b52 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1243,10 +1243,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - /* ensure we can access the full inner ip header */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return -1; - iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1321,9 +1317,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - return -1; - ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || @@ -1405,6 +1398,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 706fe42e4928..8b6eefff2f7e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -522,18 +522,18 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct 
net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h; struct flowi fl; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): - ipv6h = ipv6_hdr(skb); - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - vti6_addr_conflict(t, ipv6h)) + vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; xfrm_decode_session(skb, &fl, AF_INET6); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8276f1224f16..30337b38274b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -51,6 +51,7 @@ #include <linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> +#include <net/ip_tunnels.h> #include <linux/nospec.h> @@ -599,13 +600,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; - int err; - err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (!pskb_inet_may_pull(skb)) + goto tx_err; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto tx_err; read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; @@ -614,6 +614,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a5bb59ee50ac..36a3d8dc61f5 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -210,7 +210,7 @@ found: if (next && next->ip_defrag_offset < end) goto discard_fq; - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + /* Note : skb->ip_defrag_offset and skb->sk share the same location */ dev = skb->dev; if (dev) fq->iif = dev->ifindex; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 194bc162866d..40b225f87d5e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -210,7 +210,9 @@ struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dev); + + n = neigh_create(&nd_tbl, daddr, dev); + return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, @@ -5054,12 +5056,16 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, { struct net *net; int delay; + int ret; if (!write) return -EINVAL; net = (struct net *)ctl->extra1; delay = net->ipv6.sysctl.flush_delay; - proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (ret) + return ret; + fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); return 0; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 51c9f75f34b9..1e03305c0549 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1021,6 +1021,9 @@ tx_error: static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 9cd180bda092..7554c56b2e63 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -33,12 +33,6 @@ #define CONNCOUNT_SLOTS 256U -#ifdef CONFIG_LOCKDEP -#define CONNCOUNT_LOCK_SLOTS 8U -#else -#define CONNCOUNT_LOCK_SLOTS 256U -#endif - #define CONNCOUNT_GC_MAX_NODES 8 #define MAX_KEYLEN 5 @@ -49,8 +43,6 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; - bool dead; - struct rcu_head rcu_head; }; struct nf_conncount_rb { @@ -60,7 +52,7 @@ struct nf_conncount_rb { struct rcu_head rcu_head; }; -static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; +static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; struct nf_conncount_data { unsigned int keylen; @@ -89,79 +81,25 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - struct nf_conncount_tuple *conn; - - if (WARN_ON_ONCE(list->count > INT_MAX)) - return NF_CONNCOUNT_ERR; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return NF_CONNCOUNT_ERR; - - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - conn->dead = false; - spin_lock_bh(&list->list_lock); - if (list->dead == true) { - kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_SKIP; - } - list_add_tail(&conn->node, &list->head); - list->count++; - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_ADDED; -} -EXPORT_SYMBOL_GPL(nf_conncount_add); - -static void __conn_free(struct rcu_head *h) -{ - struct nf_conncount_tuple *conn; - - conn = container_of(h, struct nf_conncount_tuple, rcu_head); - kmem_cache_free(conncount_conn_cachep, conn); -} - -static bool conn_free(struct nf_conncount_list *list, +static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - bool free_entry = false; - - spin_lock_bh(&list->list_lock); - - if (conn->dead) { - spin_unlock_bh(&list->list_lock); - return free_entry; - } + lockdep_assert_held(&list->list_lock); list->count--; - conn->dead = true; - list_del_rcu(&conn->node); - if (list->count == 0) { - list->dead = true; - free_entry = true; - } + list_del(&conn->node); - spin_unlock_bh(&list->list_lock); - call_rcu(&conn->rcu_head, __conn_free); - return free_entry; + kmem_cache_free(conncount_conn_cachep, conn); } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn, bool *free_entry) + struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); - __s32 age; + u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) @@ -176,52 +114,45 @@ 
find_or_evict(struct net *net, struct nf_conncount_list *list, */ age = a - b; if (conn->cpu == cpu || age >= 2) { - *free_entry = conn_free(list, conn); + conn_free(list, conn); return ERR_PTR(-ENOENT); } return ERR_PTR(-EAGAIN); } -void nf_conncount_lookup(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +static int __nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; - bool free_entry = false; - - /* best effort only */ - *addit = tuple ? true : false; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (!tuple) - continue; - if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - *addit = false; - } else if (PTR_ERR(found) == -ENOENT) + return 0; /* already exists */ + } else { collect++; + } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); - if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -229,7 +160,8 @@ void nf_conncount_lookup(struct net *net, * * Attempt to avoid a re-add in this case. */ - *addit = false; + nf_ct_put(found_ct); + return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -243,19 +175,48 @@ void nf_conncount_lookup(struct net *net, nf_ct_put(found_ct); } + + if (WARN_ON_ONCE(list->count > INT_MAX)) + return -EOVERFLOW; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return -ENOMEM; + + conn->tuple = *tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + list_add_tail(&conn->node, &list->head); + list->count++; + return 0; } -EXPORT_SYMBOL_GPL(nf_conncount_lookup); + +int nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + int ret; + + /* check the saved connections */ + spin_lock_bh(&list->list_lock); + ret = __nf_conncount_add(net, list, tuple, zone); + spin_unlock_bh(&list->list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; - list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -/* Return true if the list is empty */ +/* Return true if the list is empty. Must be called with BH disabled. 
*/ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { @@ -263,17 +224,17 @@ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; - bool free_entry = false; bool ret = false; + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock(&list->list_lock)) + return false; + list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) { - if (free_entry) - return true; + if (PTR_ERR(found) == -ENOENT) collected++; - } continue; } @@ -284,23 +245,19 @@ bool nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - if (conn_free(list, conn)) - return true; + conn_free(list, conn); collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) - return false; + break; } - spin_lock_bh(&list->list_lock); - if (!list->count) { - list->dead = true; + if (!list->count) ret = true; - } - spin_unlock_bh(&list->list_lock); + spin_unlock(&list->list_lock); return ret; } @@ -314,6 +271,7 @@ static void __tree_nodes_free(struct rcu_head *h) kmem_cache_free(conncount_rb_cachep, rbconn); } +/* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -323,8 +281,10 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); + if (!rbconn->list.count) { + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } spin_unlock(&rbconn->list.list_lock); } } @@ -341,20 +301,19 @@ insert_tree(struct net *net, struct rb_root *root, unsigned int hash, const u32 *key, - u8 keylen, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; - bool node_found = false; - - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + u8 keylen = data->keylen; + bool do_gc = true; + spin_lock_bh(&nf_conncount_locks[hash]); +restart: parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { @@ -368,45 +327,32 @@ insert_tree(struct net *net, } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { - /* unlikely: other cpu added node already */ - node_found = true; - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { + int ret; + + ret = nf_conncount_add(net, &rbconn->list, tuple, zone); + if (ret) count = 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { + else count = rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ - node_found = false; - } - break; + tree_nodes_free(root, gc_nodes, gc_count); + goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; - if (nf_conncount_gc_list(net, &rbconn->list)) + if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly 
free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). - */ - - if (gc_count >= ARRAY_SIZE(gc_nodes)) - schedule_gc_worker(data, hash); + schedule_gc_worker(data, hash); + gc_count = 0; + do_gc = false; + goto restart; } - if (node_found) - goto out_unlock; - /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -430,7 +376,7 @@ insert_tree(struct net *net, rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + spin_unlock_bh(&nf_conncount_locks[hash]); return count; } @@ -441,7 +387,6 @@ count_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; @@ -454,7 +399,6 @@ count_tree(struct net *net, parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; - bool addit; rbconn = rb_entry(parent, struct nf_conncount_rb, node); @@ -464,31 +408,36 @@ count_tree(struct net *net, } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { - /* same source network -> be counted! */ - nf_conncount_lookup(net, &rbconn->list, tuple, zone, - &addit); + int ret; - if (!addit) + if (!tuple) { + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { - return 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { - return rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ + spin_lock_bh(&rbconn->list.list_lock); + /* Node might be about to be free'd. + * We need to defer to insert_tree() in this case. + */ + if (rbconn->list.count == 0) { + spin_unlock_bh(&rbconn->list.list_lock); break; } + + /* same source network -> be counted! 
*/ + ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + spin_unlock_bh(&rbconn->list.list_lock); + if (ret) + return 0; /* hotdrop */ + else + return rbconn->list.count; } } if (!tuple) return 0; - return insert_tree(net, data, root, hash, key, keylen, tuple, zone); + return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) @@ -499,27 +448,47 @@ static void tree_gc_worker(struct work_struct *work) struct rb_node *node; unsigned int tree, next_tree, gc_count = 0; - tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; + tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; + local_bh_disable(); rcu_read_lock(); for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); if (nf_conncount_gc_list(data->net, &rbconn->list)) - gc_nodes[gc_count++] = rbconn; + gc_count++; } rcu_read_unlock(); + local_bh_enable(); + + cond_resched(); spin_lock_bh(&nf_conncount_locks[tree]); + if (gc_count < ARRAY_SIZE(gc_nodes)) + goto next; /* do not bother */ - if (gc_count) { - tree_nodes_free(root, gc_nodes, gc_count); + gc_count = 0; + node = rb_first(root); + while (node != NULL) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + node = rb_next(node); + + if (rbconn->list.count > 0) + continue; + + gc_nodes[gc_count++] = rbconn; + if (gc_count >= ARRAY_SIZE(gc_nodes)) { + tree_nodes_free(root, gc_nodes, gc_count); + gc_count = 0; + } } + tree_nodes_free(root, gc_nodes, gc_count); +next: clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; - next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); + next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); if (next_tree < CONNCOUNT_SLOTS) { data->gc_tree = next_tree; @@ -621,10 +590,7 @@ static int __init nf_conncount_modinit(void) { int i; - BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS); - BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0); - - for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i) + for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fec814dace5a..2b0a93300dd7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5727,6 +5727,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); + if (!nest) + goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) goto nla_put_failure; diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index b90d96ba4a12..af1497ab9464 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -30,7 +30,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int count; - bool addit; tuple_ptr = &tuple; @@ -44,19 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, - &addit); - count = priv->list.count; - - if (!addit) - goto out; - - if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { + if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { 
regs->verdict.code = NF_DROP; return; } - count++; -out: + + count = priv->list.count; if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 03f37c4e64fe..1d3144d19903 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -153,7 +153,7 @@ static struct sock *nr_find_listener(ax25_address *addr) sk_for_each(s, &nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { - bh_lock_sock(s); + sock_hold(s); goto found; } s = NULL; @@ -174,7 +174,7 @@ static struct sock *nr_find_socket(unsigned char index, unsigned char id) struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { - bh_lock_sock(s); + sock_hold(s); goto found; } } @@ -198,7 +198,7 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id, if (nr->your_index == index && nr->your_id == id && !ax25cmp(&nr->dest_addr, dest)) { - bh_lock_sock(s); + sock_hold(s); goto found; } } @@ -224,7 +224,7 @@ static unsigned short nr_find_next_circuit(void) if (i != 0 && j != 0) { if ((sk=nr_find_socket(i, j)) == NULL) break; - bh_unlock_sock(sk); + sock_put(sk); } id++; @@ -920,6 +920,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) } if (sk != NULL) { + bh_lock_sock(sk); skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) @@ -929,6 +930,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) ret = nr_process_rx_frame(sk, skb); bh_unlock_sock(sk); + sock_put(sk); return ret; } @@ -960,10 +962,12 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) (make = nr_make_new(sk)) == NULL) { nr_transmit_refusal(skb, 0); if (sk) - bh_unlock_sock(sk); + sock_put(sk); return 0; } + bh_lock_sock(sk); + window = skb->data[20]; skb->sk = make; @@ -1016,6 +1020,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) sk->sk_data_ready(sk); bh_unlock_sock(sk); + sock_put(sk); nr_insert_socket(make); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b9bbcf3d6c63..c16f0a362c32 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -623,7 +623,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net) if (rtn->rds_tcp_sysctl) unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - if (net != &init_net && rtn->ctl_table) + if (net != &init_net) kfree(rtn->ctl_table); } diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 090658c3da12..9488600451e8 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ - auth.o auth_null.o auth_unix.o auth_generic.o \ + auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index ad8ead738981..1ff9768f5456 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -39,6 +39,20 @@ static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; +static struct cred machine_cred = { + .usage = ATOMIC_INIT(1), +}; + +/* + * Return the machine_cred pointer to be used whenever + * the a generic machine credential is needed. 
+ */ +const struct cred *rpc_machine_cred(void) +{ + return &machine_cred; +} +EXPORT_SYMBOL_GPL(rpc_machine_cred); + #define MAX_HASHTABLE_BITS (14) static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) { @@ -346,29 +360,6 @@ out_nocache: } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); -/* - * Setup a credential key lifetime timeout notification - */ -int -rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) -{ - if (!cred->cr_auth->au_ops->key_timeout) - return 0; - return cred->cr_auth->au_ops->key_timeout(auth, cred); -} -EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); - -bool -rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) -{ - if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) - return false; - if (!cred->cr_ops->crkey_to_expire) - return false; - return cred->cr_ops->crkey_to_expire(cred); -} -EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire); - char * rpcauth_stringify_acceptor(struct rpc_cred *cred) { @@ -587,13 +578,6 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; - if (flags & RPCAUTH_LOOKUP_RCU) { - if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) || - refcount_read(&entry->cr_count) == 0) - continue; - cred = entry; - break; - } cred = get_rpccred(entry); if (cred) break; @@ -603,9 +587,6 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (cred != NULL) goto found; - if (flags & RPCAUTH_LOOKUP_RCU) - return ERR_PTR(-ECHILD); - new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; @@ -656,9 +637,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) auth->au_ops->au_name); memset(&acred, 0, sizeof(acred)); - acred.uid = cred->fsuid; - acred.gid = cred->fsgid; - acred.group_info = cred->group_info; + acred.cred = cred; ret = auth->au_ops->lookup_cred(auth, &acred, flags); return ret; } @@ -672,31 +651,41 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, INIT_LIST_HEAD(&cred->cr_lru); refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; + cred->cr_flags = 0; cred->cr_ops = ops; cred->cr_expire = jiffies; - cred->cr_uid = acred->uid; + cred->cr_cred = get_cred(acred->cred); } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -struct rpc_cred * -rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) +static struct rpc_cred * +rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { - dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, - cred->cr_auth->au_ops->au_name, cred); - return get_rpccred(cred); + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred acred = { + .cred = get_task_cred(&init_task), + }; + struct rpc_cred *ret; + + dprintk("RPC: %5u looking up %s cred\n", + task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); + ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); + put_cred(acred.cred); + return ret; } -EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); static struct rpc_cred * -rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) +rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, + .principal = task->tk_client->cl_principal, + .cred = init_task.cred, }; - dprintk("RPC: %5u looking up %s cred\n", + if (!acred.principal) + return NULL; + 
dprintk("RPC: %5u looking up %s machine cred\n", task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } @@ -712,18 +701,33 @@ rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) } static int -rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) +rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) { struct rpc_rqst *req = task->tk_rqstp; - struct rpc_cred *new; + struct rpc_cred *new = NULL; int lookupflags = 0; + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred acred = { + .cred = cred, + }; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW; - if (cred != NULL) - new = cred->cr_ops->crbind(task, cred, lookupflags); - else if (flags & RPC_TASK_ROOTCREDS) + if (task->tk_op_cred) + /* Task must use exactly this rpc_cred */ + new = get_rpccred(task->tk_op_cred); + else if (cred != NULL && cred != &machine_cred) + new = auth->au_ops->lookup_cred(auth, &acred, lookupflags); + else if (cred == &machine_cred) + new = rpcauth_bind_machine_cred(task, lookupflags); + + /* If machine cred couldn't be bound, try a root cred */ + if (new) + ; + else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS)) new = rpcauth_bind_root_cred(task, lookupflags); + else if (flags & RPC_TASK_NULLCREDS) + new = authnull_ops.lookup_cred(NULL, NULL, 0); else new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) @@ -901,15 +905,10 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = rpc_init_generic_auth(); - if (err < 0) - goto out2; err = register_shrinker(&rpc_cred_shrinker); if (err < 0) - goto out3; + goto out2; return 0; -out3: - rpc_destroy_generic_auth(); out2: rpc_destroy_authunix(); out1: @@ -919,6 +918,5 @@ out1: void rpcauth_remove_module(void) { rpc_destroy_authunix(); - rpc_destroy_generic_auth(); unregister_shrinker(&rpc_cred_shrinker); } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c deleted file mode 100644 index ab4a3be1542a..000000000000 --- a/net/sunrpc/auth_generic.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Generic RPC credential - * - * Copyright (C) 2008, Trond Myklebust <Trond.Myklebust@netapp.com> - */ - -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/sunrpc/auth.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/debug.h> -#include <linux/sunrpc/sched.h> - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -#define RPC_MACHINE_CRED_USERID GLOBAL_ROOT_UID -#define RPC_MACHINE_CRED_GROUPID GLOBAL_ROOT_GID - -struct generic_cred { - struct rpc_cred gc_base; - struct auth_cred acred; -}; - -static struct rpc_auth generic_auth; -static const struct rpc_credops generic_credops; - -/* - * Public call interface - */ -struct rpc_cred *rpc_lookup_cred(void) -{ - return rpcauth_lookupcred(&generic_auth, 0); -} -EXPORT_SYMBOL_GPL(rpc_lookup_cred); - -struct rpc_cred * -rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp) -{ - return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp); -} -EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred); - -struct rpc_cred *rpc_lookup_cred_nonblock(void) -{ - return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); -} -EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock); - -/* - * Public call interface for looking up machine creds. 
- */ -struct rpc_cred *rpc_lookup_machine_cred(const char *service_name) -{ - struct auth_cred acred = { - .uid = RPC_MACHINE_CRED_USERID, - .gid = RPC_MACHINE_CRED_GROUPID, - .principal = service_name, - .machine_cred = 1, - }; - - dprintk("RPC: looking up machine cred for service %s\n", - service_name); - return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0); -} -EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); - -static struct rpc_cred *generic_bind_cred(struct rpc_task *task, - struct rpc_cred *cred, int lookupflags) -{ - struct rpc_auth *auth = task->tk_client->cl_auth; - struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; - - return auth->au_ops->lookup_cred(auth, acred, lookupflags); -} - -static int -generic_hash_cred(struct auth_cred *acred, unsigned int hashbits) -{ - return hash_64(from_kgid(&init_user_ns, acred->gid) | - ((u64)from_kuid(&init_user_ns, acred->uid) << - (sizeof(gid_t) * 8)), hashbits); -} - -/* - * Lookup generic creds for current process - */ -static struct rpc_cred * -generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) -{ - return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL); -} - -static struct rpc_cred * -generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) -{ - struct generic_cred *gcred; - - gcred = kmalloc(sizeof(*gcred), gfp); - if (gcred == NULL) - return ERR_PTR(-ENOMEM); - - rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops); - gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; - - gcred->acred.uid = acred->uid; - gcred->acred.gid = acred->gid; - gcred->acred.group_info = acred->group_info; - gcred->acred.ac_flags = 0; - if (gcred->acred.group_info != NULL) - get_group_info(gcred->acred.group_info); - gcred->acred.machine_cred = acred->machine_cred; - gcred->acred.principal = acred->principal; - - dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", - gcred->acred.machine_cred ? "machine" : "generic", - gcred, - from_kuid(&init_user_ns, acred->uid), - from_kgid(&init_user_ns, acred->gid)); - return &gcred->gc_base; -} - -static void -generic_free_cred(struct rpc_cred *cred) -{ - struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); - - dprintk("RPC: generic_free_cred %p\n", gcred); - if (gcred->acred.group_info != NULL) - put_group_info(gcred->acred.group_info); - kfree(gcred); -} - -static void -generic_free_cred_callback(struct rcu_head *head) -{ - struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu); - generic_free_cred(cred); -} - -static void -generic_destroy_cred(struct rpc_cred *cred) -{ - call_rcu(&cred->cr_rcu, generic_free_cred_callback); -} - -static int -machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flags) -{ - if (!gcred->acred.machine_cred || - gcred->acred.principal != acred->principal || - !uid_eq(gcred->acred.uid, acred->uid) || - !gid_eq(gcred->acred.gid, acred->gid)) - return 0; - return 1; -} - -/* - * Match credentials against current process creds. 
- */ -static int -generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) -{ - struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); - int i; - - if (acred->machine_cred) - return machine_cred_match(acred, gcred, flags); - - if (!uid_eq(gcred->acred.uid, acred->uid) || - !gid_eq(gcred->acred.gid, acred->gid) || - gcred->acred.machine_cred != 0) - goto out_nomatch; - - /* Optimisation in the case where pointers are identical... */ - if (gcred->acred.group_info == acred->group_info) - goto out_match; - - /* Slow path... */ - if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) - goto out_nomatch; - for (i = 0; i < gcred->acred.group_info->ngroups; i++) { - if (!gid_eq(gcred->acred.group_info->gid[i], - acred->group_info->gid[i])) - goto out_nomatch; - } -out_match: - return 1; -out_nomatch: - return 0; -} - -int __init rpc_init_generic_auth(void) -{ - return rpcauth_init_credcache(&generic_auth); -} - -void rpc_destroy_generic_auth(void) -{ - rpcauth_destroy_credcache(&generic_auth); -} - -/* - * Test the the current time (now) against the underlying credential key expiry - * minus a timeout and setup notification. - * - * The normal case: - * If 'now' is before the key expiry minus RPC_KEY_EXPIRE_TIMEO, set - * the RPC_CRED_NOTIFY_TIMEOUT flag to setup the underlying credential - * rpc_credops crmatch routine to notify this generic cred when it's key - * expiration is within RPC_KEY_EXPIRE_TIMEO, and return 0. - * - * The error case: - * If the underlying cred lookup fails, return -EACCES. - * - * The 'almost' error case: - * If 'now' is within key expiry minus RPC_KEY_EXPIRE_TIMEO, but not within - * key expiry minus RPC_KEY_EXPIRE_FAIL, set the RPC_CRED_EXPIRE_SOON bit - * on the acred ac_flags and return 0. 
- */ -static int -generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) -{ - struct auth_cred *acred = &container_of(cred, struct generic_cred, - gc_base)->acred; - struct rpc_cred *tcred; - int ret = 0; - - - /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) - return 0; - - /* Fast track for the normal case */ - if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags)) - return 0; - - /* lookup_cred either returns a valid referenced rpc_cred, or PTR_ERR */ - tcred = auth->au_ops->lookup_cred(auth, acred, 0); - if (IS_ERR(tcred)) - return -EACCES; - - /* Test for the almost error case */ - ret = tcred->cr_ops->crkey_timeout(tcred); - if (ret != 0) { - set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); - ret = 0; - } else { - /* In case underlying cred key has been reset */ - if (test_and_clear_bit(RPC_CRED_KEY_EXPIRE_SOON, - &acred->ac_flags)) - dprintk("RPC: UID %d Credential key reset\n", - from_kuid(&init_user_ns, tcred->cr_uid)); - /* set up fasttrack for the normal case */ - set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); - } - - put_rpccred(tcred); - return ret; -} - -static const struct rpc_authops generic_auth_ops = { - .owner = THIS_MODULE, - .au_name = "Generic", - .hash_cred = generic_hash_cred, - .lookup_cred = generic_lookup_cred, - .crcreate = generic_create_cred, - .key_timeout = generic_key_timeout, -}; - -static struct rpc_auth generic_auth = { - .au_ops = &generic_auth_ops, - .au_count = REFCOUNT_INIT(1), -}; - -static bool generic_key_to_expire(struct rpc_cred *cred) -{ - struct auth_cred *acred = &container_of(cred, struct generic_cred, - gc_base)->acred; - return test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); -} - -static const struct rpc_credops generic_credops = { - .cr_name = "Generic cred", - .crdestroy = generic_destroy_cred, - .crbind = generic_bind_cred, - .crmatch = generic_match, - .crkey_to_expire = generic_key_to_expire, -}; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index ba765473d1f0..dc86713b32b6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -565,7 +565,7 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; - kuid_t uid = cred->cr_uid; + kuid_t uid = cred->cr_cred->fsuid; gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) @@ -604,7 +604,7 @@ gss_refresh_upcall(struct rpc_task *task) int err = 0; dprintk("RPC: %5u %s for uid %u\n", - task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_uid)); + task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid)); gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we @@ -637,7 +637,7 @@ gss_refresh_upcall(struct rpc_task *task) out: dprintk("RPC: %5u %s for uid %u result %d\n", task->tk_pid, __func__, - from_kuid(&init_user_ns, cred->cr_uid), err); + from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } @@ -653,7 +653,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) int err; dprintk("RPC: %s for uid %u\n", - __func__, from_kuid(&init_user_ns, cred->cr_uid)); + __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid)); retry: err = 0; /* if gssd is down, just skip upcalling altogether */ @@ -701,7 +701,7 @@ out_intr: gss_release_msg(gss_msg); out: 
dprintk("RPC: %s for uid %u result %d\n", - __func__, from_kuid(&init_user_ns, cred->cr_uid), err); + __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } @@ -1248,7 +1248,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) new = kzalloc(sizeof(*gss_cred), GFP_NOIO); if (new) { struct auth_cred acred = { - .uid = gss_cred->gc_base.cr_uid, + .cred = gss_cred->gc_base.cr_cred, }; struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); @@ -1343,6 +1343,7 @@ gss_destroy_nullcred(struct rpc_cred *cred) struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); + put_cred(cred->cr_cred); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); @@ -1361,7 +1362,7 @@ gss_destroy_cred(struct rpc_cred *cred) static int gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) { - return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits); + return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits); } /* @@ -1381,7 +1382,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t int err = -ENOMEM; dprintk("RPC: %s for uid %d, flavor %d\n", - __func__, from_kuid(&init_user_ns, acred->uid), + __func__, from_kuid(&init_user_ns, acred->cred->fsuid), auth->au_flavor); if (!(cred = kzalloc(sizeof(*cred), gfp))) @@ -1394,9 +1395,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; - cred->gc_principal = NULL; - if (acred->machine_cred) - cred->gc_principal = acred->principal; + cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; @@ -1518,23 +1517,10 @@ out: if (gss_cred->gc_principal == NULL) return 0; ret = strcmp(acred->principal, gss_cred->gc_principal) == 0; - goto check_expire; - } - if (gss_cred->gc_principal != NULL) - return 0; - ret = uid_eq(rc->cr_uid, acred->uid); - -check_expire: - if (ret == 0) - return ret; - - /* Notify acred users of GSS context expiration timeout */ - if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags) && - (gss_key_timeout(rc) != 0)) { - /* test will now be done from generic cred */ - test_and_clear_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); - /* tell NFS layer that key will expire soon */ - set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); + } else { + if (gss_cred->gc_principal != NULL) + return 0; + ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid); } return ret; } @@ -1607,9 +1593,8 @@ static int gss_renew_cred(struct rpc_task *task) gc_base); struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { - .uid = oldcred->cr_uid, + .cred = oldcred->cr_cred, .principal = gss_cred->gc_principal, - .machine_cred = (gss_cred->gc_principal != NULL ? 
1 : 0), }; struct rpc_cred *new; @@ -2110,7 +2095,6 @@ static const struct rpc_credops gss_credops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, .cr_init = gss_cred_init, - .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh, @@ -2125,7 +2109,6 @@ static const struct rpc_credops gss_credops = { static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_nullcred, - .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh_null, diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 16ac0f4cb7d8..379318dff534 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -244,7 +244,7 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) /** * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors - * @array: array to fill in + * @array_ptr: array to fill in * @size: size of "array" * * Returns the number of array items filled in, or a negative errno. diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1ece4bc3eb8d..152790ed309c 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1142,7 +1142,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct kvec *resv = &rqstp->rq_res.head[0]; struct rsi *rsip, rsikey; int ret; - struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); + struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); ret = gss_read_verf(gc, argv, authp, @@ -1253,7 +1253,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, uint64_t handle; int status; int ret; - struct net *net = rqstp->rq_xprt->xpt_net; + struct net *net = SVC_NET(rqstp); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); @@ -1444,7 +1444,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) __be32 *rpcstart; __be32 *reject_stat = resv->iov_base + resv->iov_len; int ret; - struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); + struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n", argv->iov_len); @@ -1734,7 +1734,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *resbuf = &rqstp->rq_res; int stat = -EINVAL; - struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); + struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 2694a1bc026b..d0ceac57c06e 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -36,8 +36,6 @@ nul_destroy(struct rpc_auth *auth) static struct rpc_cred * nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - if (flags & RPCAUTH_LOOKUP_RCU) - return &null_cred; return get_rpccred(&null_cred); } @@ -116,7 +114,6 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, - .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = REFCOUNT_INIT(1), @@ -126,7 +123,6 @@ static const struct rpc_credops null_credops = { .cr_name = "AUTH_NULL", .crdestroy = nul_destroy_cred, - .crbind = rpcauth_generic_bind_cred, .crmatch = nul_match, 
.crmarshal = nul_marshal, .crrefresh = nul_refresh, diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4c1c7e56288f..387f6b3ffbea 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -11,16 +11,11 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/mempool.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/user_namespace.h> -struct unx_cred { - struct rpc_cred uc_base; - kgid_t uc_gid; - kgid_t uc_gids[UNX_NGROUPS]; -}; -#define uc_uid uc_base.cr_uid #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH @@ -28,6 +23,7 @@ struct unx_cred { static struct rpc_auth unix_auth; static const struct rpc_credops unix_credops; +static mempool_t *unix_pool; static struct rpc_auth * unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) @@ -42,15 +38,6 @@ static void unx_destroy(struct rpc_auth *auth) { dprintk("RPC: destroying UNIX authenticator %p\n", auth); - rpcauth_clear_credcache(auth->au_credcache); -} - -static int -unx_hash_cred(struct auth_cred *acred, unsigned int hashbits) -{ - return hash_64(from_kgid(&init_user_ns, acred->gid) | - ((u64)from_kuid(&init_user_ns, acred->uid) << - (sizeof(gid_t) * 8)), hashbits); } /* @@ -59,52 +46,24 @@ unx_hash_cred(struct auth_cred *acred, unsigned int hashbits) static struct rpc_cred * unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); -} - -static struct rpc_cred * -unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) -{ - struct unx_cred *cred; - unsigned int groups = 0; - unsigned int i; + struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_NOFS); dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", - from_kuid(&init_user_ns, acred->uid), - from_kgid(&init_user_ns, acred->gid)); - - if (!(cred = kmalloc(sizeof(*cred), gfp))) - return ERR_PTR(-ENOMEM); + from_kuid(&init_user_ns, acred->cred->fsuid), + from_kgid(&init_user_ns, acred->cred->fsgid)); - rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); - cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; - - if (acred->group_info != NULL) - groups = acred->group_info->ngroups; - if (groups > UNX_NGROUPS) - groups = UNX_NGROUPS; - - cred->uc_gid = acred->gid; - for (i = 0; i < groups; i++) - cred->uc_gids[i] = acred->group_info->gid[i]; - if (i < UNX_NGROUPS) - cred->uc_gids[i] = INVALID_GID; - - return &cred->uc_base; -} - -static void -unx_free_cred(struct unx_cred *unx_cred) -{ - dprintk("RPC: unx_free_cred %p\n", unx_cred); - kfree(unx_cred); + rpcauth_init_cred(ret, acred, auth, &unix_credops); + ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + return ret; } static void unx_free_cred_callback(struct rcu_head *head) { - struct unx_cred *unx_cred = container_of(head, struct unx_cred, uc_base.cr_rcu); - unx_free_cred(unx_cred); + struct rpc_cred *rpc_cred = container_of(head, struct rpc_cred, cr_rcu); + dprintk("RPC: unx_free_cred %p\n", rpc_cred); + put_cred(rpc_cred->cr_cred); + mempool_free(rpc_cred, unix_pool); } static void @@ -114,30 +73,32 @@ unx_destroy_cred(struct rpc_cred *cred) } /* - * Match credentials against current process creds. - * The root_override argument takes care of cases where the caller may - * request root creds (e.g. for NFS swapping). + * Match credentials against current the auth_cred. 
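The auth_unix hunks above (and the auth_gss hunks before them) stop copying uid/gid values into per-flavor cred structures and instead pin a reference-counted struct cred, which the destructor later drops with put_cred(). A minimal sketch of that reference-counting pattern, using hypothetical demo_ names rather than anything from the patch:

#include <linux/cred.h>

/* Hypothetical example: pin the caller's credentials for later use. */
struct demo_handle {
	const struct cred *cred;	/* counted reference */
};

static void demo_handle_init(struct demo_handle *h)
{
	/* get_cred() bumps the refcount; the pointer stays valid until put_cred(). */
	h->cred = get_cred(current_cred());
}

static void demo_handle_release(struct demo_handle *h)
{
	/* Drops the reference taken above; fsuid/fsgid reads must happen before this. */
	put_cred(h->cred);
	h->cred = NULL;
}

Because the reference keeps the cred alive, comparisons such as uid_eq(cred->cr_cred->fsuid, acred->cred->fsuid) in the reworked unx_match() below cannot be left dereferencing freed memory.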
*/ static int -unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) +unx_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) { - struct unx_cred *cred = container_of(rcred, struct unx_cred, uc_base); unsigned int groups = 0; unsigned int i; + if (cred->cr_cred == acred->cred) + return 1; - if (!uid_eq(cred->uc_uid, acred->uid) || !gid_eq(cred->uc_gid, acred->gid)) + if (!uid_eq(cred->cr_cred->fsuid, acred->cred->fsuid) || !gid_eq(cred->cr_cred->fsgid, acred->cred->fsgid)) return 0; - if (acred->group_info != NULL) - groups = acred->group_info->ngroups; + if (acred->cred && acred->cred->group_info != NULL) + groups = acred->cred->group_info->ngroups; if (groups > UNX_NGROUPS) groups = UNX_NGROUPS; + if (cred->cr_cred->group_info == NULL) + return groups == 0; + if (groups != cred->cr_cred->group_info->ngroups) + return 0; + for (i = 0; i < groups ; i++) - if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) + if (!gid_eq(cred->cr_cred->group_info->gid[i], acred->cred->group_info->gid[i])) return 0; - if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups])) - return 0; return 1; } @@ -149,9 +110,10 @@ static __be32 * unx_marshal(struct rpc_task *task, __be32 *p) { struct rpc_clnt *clnt = task->tk_client; - struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); + struct rpc_cred *cred = task->tk_rqstp->rq_cred; __be32 *base, *hold; int i; + struct group_info *gi = cred->cr_cred->group_info; *p++ = htonl(RPC_AUTH_UNIX); base = p++; @@ -162,11 +124,12 @@ unx_marshal(struct rpc_task *task, __be32 *p) */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); - *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid)); - *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid)); + *p++ = htonl((u32) from_kuid(&init_user_ns, cred->cr_cred->fsuid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, cred->cr_cred->fsgid)); hold = p++; - for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++) - *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i])); + if (gi) + for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++) + *p++ = htonl((u32) from_kgid(&init_user_ns, gi->gid[i])); *hold = htonl(p - hold - 1); /* gid array length */ *base = htonl((p - base - 1) << 2); /* cred length */ @@ -213,12 +176,13 @@ unx_validate(struct rpc_task *task, __be32 *p) int __init rpc_init_authunix(void) { - return rpcauth_init_credcache(&unix_auth); + unix_pool = mempool_create_kmalloc_pool(16, sizeof(struct rpc_cred)); + return unix_pool ? 
0 : -ENOMEM; } void rpc_destroy_authunix(void) { - rpcauth_destroy_credcache(&unix_auth); + mempool_destroy(unix_pool); } const struct rpc_authops authunix_ops = { @@ -227,16 +191,13 @@ const struct rpc_authops authunix_ops = { .au_name = "UNIX", .create = unx_create, .destroy = unx_destroy, - .hash_cred = unx_hash_cred, .lookup_cred = unx_lookup_cred, - .crcreate = unx_create_cred, }; static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, - .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = REFCOUNT_INIT(1), @@ -246,7 +207,6 @@ static const struct rpc_credops unix_credops = { .cr_name = "AUTH_UNIX", .crdestroy = unx_destroy_cred, - .crbind = rpcauth_generic_bind_cred, .crmatch = unx_match, .crmarshal = unx_marshal, .crrefresh = unx_refresh, diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index fa5ba6ed3197..ec451b8114b0 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -197,7 +197,7 @@ out_free: /** * xprt_destroy_backchannel - Destroys the backchannel preallocated structures. * @xprt: the transport holding the preallocated strucures - * @max_reqs the maximum number of preallocated structures to destroy + * @max_reqs: the maximum number of preallocated structures to destroy * * Since these structures may have been allocated by multiple calls * to xprt_setup_backchannel, we only destroy up to the maximum number diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f96345b1180e..12bb23b8e0c5 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -54,6 +54,11 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } +static void cache_fresh_locked(struct cache_head *head, time_t expiry, + struct cache_detail *detail); +static void cache_fresh_unlocked(struct cache_head *head, + struct cache_detail *detail); + static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, struct cache_head *key, int hash) @@ -100,6 +105,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, if (cache_is_expired(detail, tmp)) { hlist_del_init_rcu(&tmp->cache_list); detail->entries --; + cache_fresh_locked(tmp, 0, detail); freeme = tmp; break; } @@ -115,8 +121,10 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, cache_get(new); spin_unlock(&detail->hash_lock); - if (freeme) + if (freeme) { + cache_fresh_unlocked(freeme, detail); cache_put(freeme, detail); + } return new; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 24cbddc44c88..71d9599b5816 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -627,6 +627,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; + new->cl_principal = clnt->cl_principal; return new; out_err: @@ -1029,7 +1030,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; if (msg->rpc_cred != NULL) - task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred); + task->tk_msg.rpc_cred = get_cred(msg->rpc_cred); } } @@ -2521,9 +2522,8 @@ static int rpc_ping(struct rpc_clnt *clnt) .rpc_proc = &rpcproc_null, }; int err; - msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0); - err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN); - 
put_rpccred(msg.rpc_cred); + err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN | + RPC_TASK_NULLCREDS); return err; } @@ -2534,15 +2534,15 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, { struct rpc_message msg = { .rpc_proc = &rpcproc_null, - .rpc_cred = cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_xprt = xprt, .rpc_message = &msg, + .rpc_op_cred = cred, .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, .callback_data = data, - .flags = flags, + .flags = flags | RPC_TASK_NULLCREDS, }; return rpc_run_task(&task_setup_data); @@ -2593,7 +2593,6 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, void *dummy) { struct rpc_cb_add_xprt_calldata *data; - struct rpc_cred *cred; struct rpc_task *task; data = kmalloc(sizeof(*data), GFP_NOFS); @@ -2602,11 +2601,9 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, data->xps = xprt_switch_get(xps); data->xprt = xprt_get(xprt); - cred = authnull_ops.lookup_cred(NULL, NULL, 0); - task = rpc_call_null_helper(clnt, xprt, cred, - RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC, + task = rpc_call_null_helper(clnt, xprt, NULL, + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS, &rpc_cb_add_xprt_call_ops, data); - put_rpccred(cred); if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); @@ -2637,7 +2634,6 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct rpc_cred *cred; struct rpc_task *task; struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; int status = -EADDRINUSE; @@ -2649,11 +2645,9 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, goto out_err; /* Test the connection */ - cred = authnull_ops.lookup_cred(NULL, NULL, 0); - task = rpc_call_null_helper(clnt, xprt, cred, - RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + task = rpc_call_null_helper(clnt, xprt, NULL, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, NULL, NULL); - put_rpccred(cred); if (IS_ERR(task)) { status = PTR_ERR(task); goto out_err; @@ -2667,6 +2661,9 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ xtest->add_xprt_test(clnt, xprt, xtest->data); + xprt_put(xprt); + xprt_switch_put(xps); + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ return 1; out_err: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 4fda18d47e2c..69663681bf9d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1266,7 +1266,7 @@ static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { * that this file will be there and have a certain format. 
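The rpc_pipe.c hunk continuing just below replaces the hand-rolled single_open()/seq_read boilerplate with DEFINE_SHOW_ATTRIBUTE(), which generates the _open helper and _fops table from the _show function's name. A hedged sketch of the same pattern with an invented demo_stats file (the debugfs registration path shown is an assumption, not how rpc_pipe.c wires it up):

#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>

/* Hypothetical example; "demo_stats" is not a name used by the patch. */
static int demo_stats_show(struct seq_file *m, void *v)
{
	seq_printf(m, "frobnications: %d\n", 42);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(demo_stats);	/* generates demo_stats_open and demo_stats_fops */

/* Usage sketch: expose the generated fops, e.g. from a debugfs init path. */
static void demo_stats_register(struct dentry *parent)
{
	debugfs_create_file("demo_stats", 0400, parent, NULL, &demo_stats_fops);
}

The macro keys everything off the _show suffix, which is why the patch renames rpc_show_dummy_info() to rpc_dummy_info_show().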
*/ static int -rpc_show_dummy_info(struct seq_file *m, void *v) +rpc_dummy_info_show(struct seq_file *m, void *v) { seq_printf(m, "RPC server: %s\n", utsname()->nodename); seq_printf(m, "service: foo (1) version 0\n"); @@ -1275,25 +1275,12 @@ rpc_show_dummy_info(struct seq_file *m, void *v) seq_printf(m, "port: 0\n"); return 0; } - -static int -rpc_dummy_info_open(struct inode *inode, struct file *file) -{ - return single_open(file, rpc_show_dummy_info, NULL); -} - -static const struct file_operations rpc_dummy_info_operations = { - .owner = THIS_MODULE, - .open = rpc_dummy_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info); static const struct rpc_filelist gssd_dummy_info_file[] = { [0] = { .name = "info", - .i_fop = &rpc_dummy_info_operations, + .i_fop = &rpc_dummy_info_fops, .mode = S_IFREG | 0400, }, }; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c7872bc13860..41a971ac1c63 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -752,7 +752,7 @@ void rpcb_getport_async(struct rpc_task *task) goto bailout_nofree; } - map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); + map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS); if (!map) { status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", @@ -770,7 +770,13 @@ void rpcb_getport_async(struct rpc_task *task) case RPCBVERS_4: case RPCBVERS_3: map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID]; - map->r_addr = rpc_sockaddr2uaddr(sap, GFP_ATOMIC); + map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS); + if (!map->r_addr) { + status = -ENOMEM; + dprintk("RPC: %5u %s: no memory available\n", + task->tk_pid, __func__); + goto bailout_free_args; + } map->r_owner = ""; break; case RPCBVERS_2: @@ -793,6 +799,8 @@ void rpcb_getport_async(struct rpc_task *task) rpc_put_task(child); return; +bailout_free_args: + kfree(map); bailout_release_client: rpc_release_client(rpcb_clnt); bailout_nofree: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 57ca5bead1cb..adc3c40cc733 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -997,6 +997,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta task->tk_xprt = xprt_get(task_setup_data->rpc_xprt); + task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred); + if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; @@ -1054,6 +1056,7 @@ static void rpc_free_task(struct rpc_task *task) { unsigned short tk_flags = task->tk_flags; + put_rpccred(task->tk_op_cred); rpc_release_calldata(task->tk_ops, task->tk_calldata); if (tk_flags & RPC_TASK_DYNAMIC) { @@ -1071,7 +1074,7 @@ static void rpc_release_resources_task(struct rpc_task *task) { xprt_release(task); if (task->tk_msg.rpc_cred) { - put_rpccred(task->tk_msg.rpc_cred); + put_cred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; } rpc_task_release_client(task); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d13e05f1a990..e87ddb9f7feb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1145,6 +1145,17 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, .. #endif /* + * Setup response header for TCP, it has a 4B record length field. + */ +static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... */ + svc_putnl(resv, 0); +} + +/* * Common routine for processing the RPC request. 
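svc_tcp_prep_reply_hdr(), moved into svc.c above, reserves the 4-byte record marker that RPC-over-TCP framing requires (RFC 5531 record marking); the real length is patched in when the reply is transmitted. A standalone sketch of what that marker encodes (the buffer handling here is illustrative only, not the sunrpc helpers):

#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>

#define RPC_LAST_FRAGMENT	0x80000000u

/* Illustrative only: frame one RPC reply fragment for a stream socket. */
static void demo_set_record_marker(void *hdr, u32 payload_len)
{
	/* High bit set = last fragment; low 31 bits = fragment length. */
	__be32 marker = cpu_to_be32(RPC_LAST_FRAGMENT | payload_len);

	memcpy(hdr, &marker, sizeof(marker));
}

Datagram transports need no such marker, which is why svc_process_common() now only calls the helper when rq_prot == IPPROTO_TCP.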
*/ static int @@ -1172,7 +1183,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) clear_bit(RQ_DROPME, &rqstp->rq_flags); /* Setup reply header */ - rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); + if (rqstp->rq_prot == IPPROTO_TCP) + svc_tcp_prep_reply_hdr(rqstp); svc_putu32(resv, rqstp->rq_xid); @@ -1244,7 +1256,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) * for lower versions. RPC_PROG_MISMATCH seems to be the closest * fit. */ - if (versp->vs_need_cong_ctrl && + if (versp->vs_need_cong_ctrl && rqstp->rq_xprt && !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) goto err_bad_vers; @@ -1336,7 +1348,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) return 0; close: - if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) + if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) svc_close_xprt(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); return 0; @@ -1459,10 +1471,10 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, dprintk("svc: %s(%p)\n", __func__, req); /* Build the svc_rqst used by the common processing routine */ - rqstp->rq_xprt = serv->sv_bc_xprt; rqstp->rq_xid = req->rq_xid; rqstp->rq_prot = req->rq_xprt->prot; rqstp->rq_server = serv; + rqstp->rq_bc_net = req->rq_xprt->xprt_net; rqstp->rq_addrlen = sizeof(req->rq_xprt->addr); memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); @@ -1499,9 +1511,9 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); - return 0; + error = -EINVAL; + goto out; } - /* Finally, send the reply synchronously */ memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 51d36230b6e3..4eb8fbf2508d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -296,9 +296,9 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, request_module("svc%s", xprt_name); err = _svc_create_xprt(serv, xprt_name, net, family, port, flags); } - if (err) + if (err < 0) dprintk("svc: transport %s not found, err %d\n", - xprt_name, err); + xprt_name, -err); return err; } EXPORT_SYMBOL_GPL(svc_create_xprt); @@ -468,10 +468,11 @@ out: */ void svc_reserve(struct svc_rqst *rqstp, int space) { + struct svc_xprt *xprt = rqstp->rq_xprt; + space += rqstp->rq_res.head[0].iov_len; - if (space < rqstp->rq_reserved) { - struct svc_xprt *xprt = rqstp->rq_xprt; + if (xprt && space < rqstp->rq_reserved) { atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 986f3ed7d1a2..a6a060925e5d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -70,13 +70,6 @@ static void svc_sock_free(struct svc_xprt *); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, struct net *, struct sockaddr *, int, int); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, - struct net *, struct sockaddr *, - int, int); -static void svc_bc_sock_free(struct svc_xprt *xprt); -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; static struct lock_class_key svc_slock_key[2]; @@ -549,7 +542,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) /* Don't enable netstamp, sunrpc doesn't need that much 
accuracy */ } - svsk->sk_sk->sk_stamp = skb->tstamp; + sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ len = skb->len; @@ -617,10 +610,6 @@ svc_udp_sendto(struct svc_rqst *rqstp) return error; } -static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) -{ -} - static int svc_udp_has_wspace(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); @@ -664,7 +653,6 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_release_rqst = svc_release_udp_skb, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, - .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, .xpo_secure_port = svc_sock_secure_port, @@ -1170,17 +1158,6 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) return sent; } -/* - * Setup response header. TCP has a 4B record length field. - */ -static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) -{ - struct kvec *resv = &rqstp->rq_res.head[0]; - - /* tcp needs a space for the record length... */ - svc_putnl(resv, 0); -} - static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -1189,58 +1166,6 @@ static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, - struct net *, struct sockaddr *, - int, int); -static void svc_bc_sock_free(struct svc_xprt *xprt); - -static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv, - struct net *net, - struct sockaddr *sa, int salen, - int flags) -{ - return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); -} - -static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) -{ -} - -static const struct svc_xprt_ops svc_tcp_bc_ops = { - .xpo_create = svc_bc_tcp_create, - .xpo_detach = svc_bc_tcp_sock_detach, - .xpo_free = svc_bc_sock_free, - .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, - .xpo_secure_port = svc_sock_secure_port, -}; - -static struct svc_xprt_class svc_tcp_bc_class = { - .xcl_name = "tcp-bc", - .xcl_owner = THIS_MODULE, - .xcl_ops = &svc_tcp_bc_ops, - .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, -}; - -static void svc_init_bc_xprt_sock(void) -{ - svc_reg_xprt_class(&svc_tcp_bc_class); -} - -static void svc_cleanup_bc_xprt_sock(void) -{ - svc_unreg_xprt_class(&svc_tcp_bc_class); -} -#else /* CONFIG_SUNRPC_BACKCHANNEL */ -static void svc_init_bc_xprt_sock(void) -{ -} - -static void svc_cleanup_bc_xprt_sock(void) -{ -} -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ - static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, @@ -1248,7 +1173,6 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_release_rqst = svc_release_skb, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, - .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, @@ -1267,14 +1191,12 @@ void svc_init_xprt_sock(void) { svc_reg_xprt_class(&svc_tcp_class); svc_reg_xprt_class(&svc_udp_class); - svc_init_bc_xprt_sock(); } void svc_cleanup_xprt_sock(void) { svc_unreg_xprt_class(&svc_tcp_class); svc_unreg_xprt_class(&svc_udp_class); - svc_cleanup_bc_xprt_sock(); } static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) @@ -1595,45 +1517,3 @@ static void 
svc_sock_free(struct svc_xprt *xprt) sock_release(svsk->sk_sock); kfree(svsk); } - -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Create a back channel svc_xprt which shares the fore channel socket. - */ -static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv, - int protocol, - struct net *net, - struct sockaddr *sin, int len, - int flags) -{ - struct svc_sock *svsk; - struct svc_xprt *xprt; - - if (protocol != IPPROTO_TCP) { - printk(KERN_WARNING "svc: only TCP sockets" - " supported on shared back channel\n"); - return ERR_PTR(-EINVAL); - } - - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); - if (!svsk) - return ERR_PTR(-ENOMEM); - - xprt = &svsk->sk_xprt; - svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv); - set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); - - serv->sv_bc_xprt = xprt; - - return xprt; -} - -/* - * Free a back channel svc_sock. - */ -static void svc_bc_sock_free(struct svc_xprt *xprt) -{ - if (xprt) - kfree(container_of(xprt, struct svc_sock, sk_xprt)); -} -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index e2d64c7138c3..8394124126f8 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -383,7 +383,7 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, /** * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch * @xpi: pointer to rpc_xprt_iter - * @xps: pointer to a new rpc_xprt_switch or NULL + * @newswitch: pointer to a new rpc_xprt_switch or NULL * * Swaps out the existing xpi->xpi_xpswitch with a new value. */ @@ -401,7 +401,7 @@ struct rpc_xprt_switch *xprt_iter_xchg_switch(struct rpc_xprt_iter *xpi, /** * xprt_iter_destroy - Destroys the xprt iterator - * @xpi pointer to rpc_xprt_iter + * @xpi: pointer to rpc_xprt_iter */ void xprt_iter_destroy(struct rpc_xprt_iter *xpi) { diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 8bf19e142b6b..8ed0377d7a18 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -rpcrdma-y := transport.o rpc_rdma.o verbs.o \ - fmr_ops.o frwr_ops.o \ +rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ module.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index e5b367a3e517..0de9b3e63770 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -5,7 +5,6 @@ * Support for backward direction RPCs on RPC/RDMA. 
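Several hunks in this section (@max_reqs in backchannel_rqst.c, @array_ptr in gss_mech_switch.c, @newswitch and @xpi just above) are pure kernel-doc fixes: the @ tags must name the parameters exactly as they appear in the prototype, or the documentation build warns. A short sketch of the expected layout, with an invented function:

/**
 * demo_frob_widget - Frobnicate one widget
 * @widget: widget to frobnicate
 * @flags: DEMO_* behaviour flags
 *
 * Context: process context, may sleep.
 *
 * Return: zero on success, or a negative errno on failure.
 */
static int demo_frob_widget(struct demo_widget *widget, unsigned long flags)
{
	return 0;
}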
*/ -#include <linux/module.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svc_xprt.h> @@ -20,29 +19,16 @@ #undef RPCRDMA_BACKCHANNEL_DEBUG -static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, - struct rpc_rqst *rqst) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - - spin_lock(&buf->rb_reqslock); - list_del(&req->rl_all); - spin_unlock(&buf->rb_reqslock); - - rpcrdma_destroy_req(req); -} - static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, unsigned int count) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_req *req; struct rpc_rqst *rqst; unsigned int i; for (i = 0; i < (count << 1); i++) { struct rpcrdma_regbuf *rb; - struct rpcrdma_req *req; size_t size; req = rpcrdma_create_req(r_xprt); @@ -68,7 +54,7 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, return 0; out_fail: - rpcrdma_bc_free_rqst(r_xprt, rqst); + rpcrdma_req_destroy(req); return -ENOMEM; } @@ -101,7 +87,6 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) goto out_free; r_xprt->rx_buf.rb_bc_srv_max_requests = reqs; - request_module("svcrdma"); trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; @@ -114,26 +99,6 @@ out_err: } /** - * xprt_rdma_bc_up - Create transport endpoint for backchannel service - * @serv: server endpoint - * @net: network namespace - * - * The "xprt" is an implied argument: it supplies the name of the - * backchannel transport class. - * - * Returns zero on success, negative errno on failure - */ -int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) -{ - int ret; - - ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); - if (ret < 0) - return ret; - return 0; -} - -/** * xprt_rdma_bc_maxpayload - Return maximum backchannel message size * @xprt: transport * @@ -193,21 +158,21 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) */ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); int rc; - if (!xprt_connected(rqst->rq_xprt)) - goto drop_connection; + if (!xprt_connected(xprt)) + return -ENOTCONN; - if (!xprt_request_get_cong(rqst->rq_xprt, rqst)) + if (!xprt_request_get_cong(xprt, rqst)) return -EBADSLT; rc = rpcrdma_bc_marshal_reply(rqst); if (rc < 0) goto failed_marshal; - rpcrdma_post_recvs(r_xprt, true); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; return 0; @@ -216,7 +181,7 @@ failed_marshal: if (rc != -ENOTCONN) return rc; drop_connection: - xprt_disconnect_done(rqst->rq_xprt); + xprt_rdma_close(xprt); return -ENOTCONN; } @@ -227,7 +192,6 @@ drop_connection: */ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; spin_lock(&xprt->bc_pa_lock); @@ -235,7 +199,7 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); - rpcrdma_bc_free_rqst(r_xprt, rqst); + rpcrdma_req_destroy(rpcr_to_rdmar(rqst)); spin_lock(&xprt->bc_pa_lock); } @@ -251,9 +215,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpc_xprt *xprt = rqst->rq_xprt; - dprintk("RPC: %s: freeing rqst %p (req %p)\n", - __func__, rqst, req); - rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = 
NULL; @@ -339,7 +300,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, out_overflow: pr_warn("RPC/RDMA backchannel overflow\n"); - xprt_disconnect_done(xprt); + xprt_force_disconnect(xprt); /* This receive buffer gets reposted automatically * when the connection is re-established. */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c deleted file mode 100644 index fd8fea59fe92..000000000000 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ /dev/null @@ -1,337 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2015, 2017 Oracle. All rights reserved. - * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. - */ - -/* Lightweight memory registration using Fast Memory Regions (FMR). - * Referred to sometimes as MTHCAFMR mode. - * - * FMR uses synchronous memory registration and deregistration. - * FMR registration is known to be fast, but FMR deregistration - * can take tens of usecs to complete. - */ - -/* Normal operation - * - * A Memory Region is prepared for RDMA READ or WRITE using the - * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is - * finished, the Memory Region is unmapped using the ib_unmap_fmr - * verb (fmr_op_unmap). - */ - -#include <linux/sunrpc/svc_rdma.h> - -#include "xprt_rdma.h" -#include <trace/events/rpcrdma.h> - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -/* Maximum scatter/gather per FMR */ -#define RPCRDMA_MAX_FMR_SGES (64) - -/* Access mode of externally registered pages */ -enum { - RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ, -}; - -bool -fmr_is_supported(struct rpcrdma_ia *ia) -{ - if (!ia->ri_device->ops.alloc_fmr) { - pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", - ia->ri_device->name); - return false; - } - return true; -} - -static void -__fmr_unmap(struct rpcrdma_mr *mr) -{ - LIST_HEAD(l); - int rc; - - list_add(&mr->fmr.fm_mr->list, &l); - rc = ib_unmap_fmr(&l); - list_del(&mr->fmr.fm_mr->list); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); -} - -/* Release an MR. - */ -static void -fmr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - kfree(mr->fmr.fm_physaddrs); - kfree(mr->mr_sg); - - /* In case this one was left mapped, try to unmap it - * to prevent dealloc_fmr from failing with EBUSY - */ - __fmr_unmap(mr); - - rc = ib_dealloc_fmr(mr->fmr.fm_mr); - if (rc) - pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", - mr, rc); - - kfree(mr); -} - -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand. 
- */ -static void -fmr_mr_recycle_worker(struct work_struct *work) -{ - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - trace_xprtrdma_mr_recycle(mr); - - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); -} - -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -/* On success, sets: - * ep->rep_attr.cap.max_send_wr - * ep->rep_attr.cap.max_recv_wr - * cdata->max_requests - * ia->ri_max_segs - */ -static int -fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) -{ - int max_qp_wr; - - max_qp_wr = ia->ri_device->attrs.max_qp_wr; - max_qp_wr -= RPCRDMA_BACKWARD_WRS; - max_qp_wr -= 1; - if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) - return -ENOMEM; - if (cdata->max_requests > max_qp_wr) - cdata->max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - RPCRDMA_MAX_FMR_SGES); - ia->ri_max_segs += 2; /* segments for head and tail buffers */ - return 0; -} - -/* FMR mode conveys up to 64 pages of payload per chunk segment. - */ -static size_t -fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) -{ - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); -} - -/* Use the ib_map_phys_fmr() verb to register a memory region - * for remote access via RDMA READ or RDMA WRITE. 
- */ -static struct rpcrdma_mr_seg * -fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mr **out) -{ - struct rpcrdma_mr_seg *seg1 = seg; - int len, pageoff, i, rc; - struct rpcrdma_mr *mr; - u64 *dma_pages; - - mr = rpcrdma_mr_get(r_xprt); - if (!mr) - return ERR_PTR(-EAGAIN); - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_FMR_SGES) - nsegs = RPCRDMA_MAX_FMR_SGES; - for (i = 0; i < nsegs;) { - if (seg->mr_page) - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); - else - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, - seg->mr_len); - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - mr->mr_dir = rpcrdma_data_dir(writing); - - mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, i, mr->mr_dir); - if (!mr->mr_nents) - goto out_dmamap_err; - trace_xprtrdma_mr_map(mr); - - for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) - dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); - rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, - dma_pages[0]); - if (rc) - goto out_maperr; - - mr->mr_handle = mr->fmr.fm_mr->rkey; - mr->mr_length = len; - mr->mr_offset = dma_pages[0] + pageoff; - - *out = mr; - return seg; - -out_dmamap_err: - pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mr->mr_sg, i); - rpcrdma_mr_put(mr); - return ERR_PTR(-EIO); - -out_maperr: - pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", - len, (unsigned long long)dma_pages[0], - pageoff, mr->mr_nents, rc); - rpcrdma_mr_unmap_and_put(mr); - return ERR_PTR(-EIO); -} - -/* Post Send WR containing the RPC Call message. - */ -static int -fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) -{ - return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL); -} - -/* Invalidate all memory regions that were registered for "req". - * - * Sleeps until it is safe for the host CPU to access the - * previously mapped memory regions. - * - * Caller ensures that @mrs is not empty before the call. This - * function empties the list. - */ -static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) -{ - struct rpcrdma_mr *mr; - LIST_HEAD(unmap_list); - int rc; - - /* ORDER: Invalidate all of the req's MRs first - * - * ib_unmap_fmr() is slow, so use a single call instead - * of one call per mapped FMR. - */ - list_for_each_entry(mr, mrs, mr_list) { - dprintk("RPC: %s: unmapping fmr %p\n", - __func__, &mr->fmr); - trace_xprtrdma_mr_localinv(mr); - list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); - } - r_xprt->rx_stats.local_inv_needed++; - rc = ib_unmap_fmr(&unmap_list); - if (rc) - goto out_release; - - /* ORDER: Now DMA unmap all of the req's MRs, and return - * them to the free MW list. 
- */ - while (!list_empty(mrs)) { - mr = rpcrdma_mr_pop(mrs); - list_del(&mr->fmr.fm_mr->list); - rpcrdma_mr_unmap_and_put(mr); - } - - return; - -out_release: - pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - - while (!list_empty(mrs)) { - mr = rpcrdma_mr_pop(mrs); - list_del(&mr->fmr.fm_mr->list); - rpcrdma_mr_recycle(mr); - } -} - -const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { - .ro_map = fmr_op_map, - .ro_send = fmr_op_send, - .ro_unmap_sync = fmr_op_unmap_sync, - .ro_open = fmr_op_open, - .ro_maxpages = fmr_op_maxpages, - .ro_init_mr = fmr_op_init_mr, - .ro_release_mr = fmr_op_release_mr, - .ro_displayname = "fmr", - .ro_send_w_inv_ok = 0, -}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index fc6378cc0c1c..6a561056b538 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -15,21 +15,21 @@ /* Normal operation * * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG - * Work Request (frwr_op_map). When the RDMA operation is finished, this + * Work Request (frwr_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_op_unmap_sync). + * (frwr_unmap_sync). * * Typically these Work Requests are not signaled, and neither are RDMA * SEND Work Requests (with the exception of signaling occasionally to * prevent provider work queue overflows). This greatly reduces HCA * interrupt workload. * - * As an optimization, frwr_op_unmap marks MRs INVALID before the + * As an optimization, frwr_unmap marks MRs INVALID before the * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on * rb_mrs immediately so that no work (like managing a linked list * under a spinlock) is needed in the completion upcall. * - * But this means that frwr_op_map() can occasionally encounter an MR + * But this means that frwr_map() can occasionally encounter an MR * that is INVALID but the LOCAL_INV WR has not completed. Work Queue * ordering prevents a subsequent FAST_REG WR from executing against * that MR while it is still being invalidated. @@ -57,14 +57,14 @@ * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR * state, and the pending WR was flushed. * - * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered + * When frwr_map encounters FLUSHED and VALID MRs, they are recovered * with ib_dereg_mr and then are re-initialized. Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the * recovered MRs are placed back on the rb_mrs list when recovery is - * complete. frwr_op_map allocates another MR for the current RPC while + * complete. frwr_map allocates another MR for the current RPC while * the broken MR is reset. * - * To ensure that frwr_op_map doesn't encounter an MR that is marked + * To ensure that frwr_map doesn't encounter an MR that is marked * INVALID but that is about to be flushed due to a previous transport * disconnect, the transport connect worker attempts to drain all * pending send queue WRs before the transport is reconnected. 
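The frwr_ops.c comment above relies on the connect worker draining pending send queue WRs before reconnecting, so frwr_map() never races with a LOCAL_INV that is still flushing. A minimal sketch of that drain step using the core verbs helper (assumes the CQs were created with a polling context that ib_drain_qp() supports):

#include <rdma/ib_verbs.h>

/* Illustrative only: quiesce a QP before tearing down or reusing resources. */
static void demo_quiesce_qp(struct ib_qp *qp)
{
	/*
	 * ib_drain_qp() moves the QP to the error state and blocks until
	 * every posted send and receive WR has generated a completion,
	 * so no flushed WR can race with later MR reuse.
	 */
	ib_drain_qp(qp);
}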
@@ -80,8 +80,13 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -bool -frwr_is_supported(struct rpcrdma_ia *ia) +/** + * frwr_is_supported - Check if device supports FRWR + * @ia: interface adapter to check + * + * Returns true if device supports FRWR, otherwise false + */ +bool frwr_is_supported(struct rpcrdma_ia *ia) { struct ib_device_attr *attrs = &ia->ri_device->attrs; @@ -97,15 +102,18 @@ out_not_supported: return false; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) +/** + * frwr_release_mr - Destroy one MR + * @mr: MR allocated by frwr_init_mr + * + */ +void frwr_release_mr(struct rpcrdma_mr *mr) { int rc; rc = ib_dereg_mr(mr->frwr.fr_mr); if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); + trace_xprtrdma_frwr_dereg(mr, rc); kfree(mr->mr_sg); kfree(mr); } @@ -117,60 +125,78 @@ static void frwr_mr_recycle_worker(struct work_struct *work) { struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - enum rpcrdma_frwr_state state = mr->frwr.fr_state; struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); - if (state != FRWR_FLUSHED_LI) { + if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); + mr->mr_dir = DMA_NONE; } spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - frwr_op_release_mr(mr); + + frwr_release_mr(mr); } -static int -frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +/** + * frwr_init_mr - Initialize one MR + * @ia: interface adapter + * @mr: generic MR to prepare for FRWR + * + * Returns zero if successful. Otherwise a negative errno + * is returned. + */ +int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { unsigned int depth = ia->ri_max_frwr_depth; - struct rpcrdma_frwr *frwr = &mr->frwr; + struct scatterlist *sg; + struct ib_mr *frmr; int rc; - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); - if (IS_ERR(frwr->fr_mr)) + frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frmr)) goto out_mr_err; - mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) + sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); + if (!sg) goto out_list_err; + mr->frwr.fr_mr = frmr; + mr->frwr.fr_state = FRWR_IS_INVALID; + mr->mr_dir = DMA_NONE; INIT_LIST_HEAD(&mr->mr_list); INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); - sg_init_table(mr->mr_sg, depth); - init_completion(&frwr->fr_linv_done); + init_completion(&mr->frwr.fr_linv_done); + + sg_init_table(sg, depth); + mr->mr_sg = sg; return 0; out_mr_err: - rc = PTR_ERR(frwr->fr_mr); - dprintk("RPC: %s: ib_alloc_mr status %i\n", - __func__, rc); + rc = PTR_ERR(frmr); + trace_xprtrdma_frwr_alloc(mr, rc); return rc; out_list_err: - rc = -ENOMEM; dprintk("RPC: %s: sg allocation failure\n", __func__); - ib_dereg_mr(frwr->fr_mr); - return rc; + ib_dereg_mr(frmr); + return -ENOMEM; } -/* On success, sets: +/** + * frwr_open - Prepare an endpoint for use with FRWR + * @ia: interface adapter this endpoint will use + * @ep: endpoint to prepare + * @cdata: transport parameters + * + * On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr * cdata->max_requests @@ -179,10 +205,11 @@ out_list_err: * And these FRWR-related fields: * ia->ri_max_frwr_depth * ia->ri_mrtype + * + * On failure, a negative errno is returned. 
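frwr_init_mr() above pairs ib_alloc_mr() with a scatterlist sized to the fast-registration depth and defers DMA mapping until the MR is actually used (mr_dir starts as DMA_NONE). A condensed sketch of that allocation pattern; the demo_ helper and its error handling are illustrative, not lifted from the file:

#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

/* Illustrative only: allocate one fast-registration MR plus its sg table. */
static struct ib_mr *demo_alloc_frwr(struct ib_pd *pd, unsigned int depth,
				     struct scatterlist **sg_out)
{
	struct scatterlist *sg;
	struct ib_mr *mr;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
	if (IS_ERR(mr))
		return mr;

	sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL);
	if (!sg) {
		ib_dereg_mr(mr);
		return ERR_PTR(-ENOMEM);
	}

	sg_init_table(sg, depth);
	*sg_out = sg;
	return mr;
}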
*/ -static int -frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) { struct ib_device_attr *attrs = &ia->ri_device->attrs; int max_qp_wr, depth, delta; @@ -191,10 +218,17 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; - ia->ri_max_frwr_depth = - min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - attrs->max_fast_reg_page_list_len); - dprintk("RPC: %s: device's max FR page list len = %u\n", + /* Quirk: Some devices advertise a large max_fast_reg_page_list_len + * capability, but perform optimally when the MRs are not larger + * than a page. + */ + if (attrs->max_sge_rd > 1) + ia->ri_max_frwr_depth = attrs->max_sge_rd; + else + ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; + if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) + ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + dprintk("RPC: %s: max FR page list depth = %u\n", __func__, ia->ri_max_frwr_depth); /* Add room for frwr register and invalidate WRs. @@ -242,20 +276,28 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); - ia->ri_max_segs += 2; /* segments for head and tail buffers */ + /* Reply chunks require segments for head and tail buffers */ + ia->ri_max_segs += 2; + if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) + ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS; return 0; } -/* FRWR mode conveys a list of pages per chunk segment. The +/** + * frwr_maxpages - Compute size of largest payload + * @r_xprt: transport + * + * Returns maximum size of an RPC message, in pages. + * + * FRWR mode conveys a list of pages per chunk segment. The * maximum length of that list is the FRWR page list depth. */ -static size_t -frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); + (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth); } static void @@ -332,12 +374,25 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_li_wake(wc, frwr); } -/* Post a REG_MR Work Request to register a memory region +/** + * frwr_map - Register a memory region + * @r_xprt: controlling transport + * @seg: memory region co-ordinates + * @nsegs: number of segments remaining + * @writing: true when RDMA Write will be used + * @xid: XID of RPC using the registered memory + * @out: initialized MR + * + * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. + * + * Returns the next segment or a negative errno pointer. + * On success, the prepared MR is planted in @out. 
*/ -static struct rpcrdma_mr_seg * -frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mr **out) +struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, u32 xid, + struct rpcrdma_mr **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; @@ -384,13 +439,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); if (unlikely(n != mr->mr_nents)) goto out_mapmr_err; + ibmr->iova &= 0x00000000ffffffff; + ibmr->iova |= ((u64)cpu_to_be32(xid)) << 32; key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); @@ -404,32 +460,35 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_handle = ibmr->rkey; mr->mr_length = ibmr->length; mr->mr_offset = ibmr->iova; + trace_xprtrdma_mr_map(mr); *out = mr; return seg; out_dmamap_err: - pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mr->mr_sg, i); frwr->fr_state = FRWR_IS_INVALID; + trace_xprtrdma_frwr_sgerr(mr, i); rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: - pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", - frwr->fr_mr, n, mr->mr_nents); + trace_xprtrdma_frwr_maperr(mr, n); rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } -/* Post Send WR containing the RPC Call message. +/** + * frwr_send - post Send WR containing the RPC Call message + * @ia: interface adapter + * @req: Prepared RPC Call * - * For FRMR, chain any FastReg WRs to the Send WR. Only a + * For FRWR, chain any FastReg WRs to the Send WR. Only a * single ib_post_send call is needed to register memory * and then post the Send WR. + * + * Returns the result of ib_post_send. */ -static int -frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -451,15 +510,18 @@ frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) } /* If ib_post_send fails, the next ->send_request for - * @req will queue these MWs for recovery. + * @req will queue these MRs for recovery. */ return ib_post_send(ia->ri_id->qp, post_wr, NULL); } -/* Handle a remotely invalidated mr on the @mrs list +/** + * frwr_reminv - handle a remotely invalidated mr on the @mrs list + * @rep: Received reply + * @mrs: list of MRs to check + * */ -static void -frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) +void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) { struct rpcrdma_mr *mr; @@ -473,7 +535,10 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) } } -/* Invalidate all memory regions that were registered for "req". +/** + * frwr_unmap_sync - invalidate memory regions that were registered for @req + * @r_xprt: controlling transport + * @mrs: list of MRs to process * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. @@ -481,8 +546,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) * Caller ensures that @mrs is not empty before the call. This * function empties the list. 
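frwr_map() above folds the RPC XID into the upper 32 bits of the registered MR's iova, so RDMA segments captured on the wire can be correlated with the RPC that owns them without disturbing the low offset bits. A standalone sketch of the same bit manipulation (the demo_ name is invented; xid_be is assumed to already be in wire byte order, matching the patch's cpu_to_be32(xid)):

#include <linux/types.h>

/* Illustrative only: tag a 64-bit offset with an RPC XID for wire tracing. */
static u64 demo_tag_offset_with_xid(u64 offset, u32 xid_be)
{
	offset &= 0x00000000ffffffffULL;	/* keep the low 32 offset bits  */
	offset |= (u64)xid_be << 32;		/* XID occupies the high half   */
	return offset;
}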
*/ -static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) +void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { struct ib_send_wr *first, **prev, *last; const struct ib_send_wr *bad_wr; @@ -561,20 +625,7 @@ out_release: mr = container_of(frwr, struct rpcrdma_mr, frwr); bad_wr = bad_wr->next; - list_del(&mr->mr_list); - frwr_op_release_mr(mr); + list_del_init(&mr->mr_list); + rpcrdma_mr_recycle(mr); } } - -const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { - .ro_map = frwr_op_map, - .ro_send = frwr_op_send, - .ro_reminv = frwr_op_reminv, - .ro_unmap_sync = frwr_op_unmap_sync, - .ro_open = frwr_op_open, - .ro_maxpages = frwr_op_maxpages, - .ro_init_mr = frwr_op_init_mr, - .ro_release_mr = frwr_op_release_mr, - .ro_displayname = "frwr", - .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, -}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 9f53e0240035..d18614e02b4e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -218,11 +218,12 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - if (unlikely(!*ppages)) { - /* XXX: Certain upper layer operations do - * not provide receive buffer pages. - */ - *ppages = alloc_page(GFP_ATOMIC); + /* ACL likes to be lazy in allocating pages - ACLs + * are small by default but can get huge. + */ + if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { + if (!*ppages) + *ppages = alloc_page(GFP_ATOMIC); if (!*ppages) return -ENOBUFS; } @@ -356,8 +357,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mr); + seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); @@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); r_xprt->rx_stats.read_chunk_count++; nsegs -= mr->mr_nents; } while (nsegs); @@ -414,8 +414,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mr); + seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); @@ -423,7 +422,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; @@ -472,8 +471,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mr); + seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); @@ -481,7 +479,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - 
trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; @@ -667,7 +665,7 @@ out_mapping_overflow: out_mapping_err: rpcrdma_unmap_sendctx(sc); - pr_err("rpcrdma: Send mapping error\n"); + trace_xprtrdma_dma_maperr(sge[sge_no].addr); return false; } @@ -1188,17 +1186,20 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", - rqst->rq_task->tk_pid, __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1))); + dprintk("RPC: %s: server reports " + "version error (%u-%u), xid %08x\n", __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1)), + be32_to_cpu(rep->rr_xid)); break; case err_chunk: - dprintk("RPC: %5u: %s: server reports header decoding error\n", - rqst->rq_task->tk_pid, __func__); + dprintk("RPC: %s: server reports " + "header decoding error, xid %08x\n", __func__, + be32_to_cpu(rep->rr_xid)); break; default: - dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + dprintk("RPC: %s: server reports " + "unrecognized error %d, xid %08x\n", __func__, + be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); } r_xprt->rx_stats.bad_reply_count++; @@ -1248,7 +1249,6 @@ out: out_badheader: trace_xprtrdma_reply_hdr(rep); r_xprt->rx_stats.bad_reply_count++; - status = -EIO; goto out; } @@ -1262,8 +1262,7 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * RPC has relinquished all its Send Queue entries. */ if (!list_empty(&req->rl_registered)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); + frwr_unmap_sync(r_xprt, &req->rl_registered); /* Ensure that any DMA mapped pages associated with * the Send of the RPC Call have been unmapped before @@ -1292,7 +1291,7 @@ void rpcrdma_deferred_completion(struct work_struct *work) trace_xprtrdma_defer_cmp(rep); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) - r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + frwr_reminv(rep, &req->rl_registered); rpcrdma_release_rqst(r_xprt, req); rpcrdma_complete_rqst(rep); } @@ -1312,11 +1311,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; - --buf->rb_posted_receives; - - if (rep->rr_hdrbuf.head[0].iov_len == 0) - goto out_badstatus; - /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base); @@ -1356,36 +1350,30 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) } req = rpcr_to_rdmar(rqst); + if (req->rl_reply) { + trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + rpcrdma_recv_buffer_put(req->rl_reply); + } req->rl_reply = rep; rep->rr_rqst = rqst; clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - - rpcrdma_post_recvs(r_xprt, false); - queue_work(rpcrdma_receive_wq, &rep->rr_work); + queue_work(buf->rb_completion_wq, &rep->rr_work); return; out_badversion: trace_xprtrdma_reply_vers(rep); - goto repost; + goto out; -/* The RPC transaction has already been terminated, or the header - * is corrupt. - */ out_norqst: spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst(rep); - goto repost; + goto out; out_shortreply: trace_xprtrdma_reply_short(rep); -/* If no pending RPC transaction was matched, post a replacement - * receive buffer before returning. 
- */ -repost: - rpcrdma_post_recvs(r_xprt, false); -out_badstatus: +out: rpcrdma_recv_buffer_put(rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 134bef6a451e..abdb3004a1e3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -235,9 +235,6 @@ void svc_rdma_cleanup(void) unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - svc_unreg_xprt_class(&svc_rdma_bc_class); -#endif svc_unreg_xprt_class(&svc_rdma_class); } @@ -259,8 +256,5 @@ int svc_rdma_init(void) /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - svc_reg_xprt_class(&svc_rdma_bc_class); -#endif return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index f3c147d70286..b908f2ca08fd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -200,11 +200,10 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) svc_rdma_send_ctxt_put(rdma, ctxt); goto drop_connection; } - return rc; + return 0; drop_connection: dprintk("svcrdma: failed to send bc call\n"); - xprt_disconnect_done(xprt); return -ENOTCONN; } @@ -225,8 +224,11 @@ xprt_rdma_bc_send_request(struct rpc_rqst *rqst) ret = -ENOTCONN; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) { ret = rpcrdma_bc_send_request(rdma, rqst); + if (ret == -ENOTCONN) + svc_close_xprt(sxprt); + } mutex_unlock(&sxprt->xpt_mutex); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index b24d5b8f2fee..828b149eaaef 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -485,6 +485,68 @@ static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) return p; } +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one R_key to invalidate. + * + * If there is exactly one distinct R_key in the received transport + * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero. + * + * Perform this operation while the received transport header is + * still in the CPU cache. + */ +static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + __be32 inv_rkey, *p; + u32 i, segcount; + + ctxt->rc_inv_rkey = 0; + + if (!rdma->sc_snd_w_inv) + return; + + inv_rkey = xdr_zero; + p = ctxt->rc_recv_buf; + p += rpcrdma_fixed_maxsz; + + /* Read list */ + while (*p++ != xdr_zero) { + p++; /* position */ + if (inv_rkey == xdr_zero) + inv_rkey = *p; + else if (inv_rkey != *p) + return; + p += 4; + } + + /* Write list */ + while (*p++ != xdr_zero) { + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + if (inv_rkey == xdr_zero) + inv_rkey = *p; + else if (inv_rkey != *p) + return; + p += 4; + } + } + + /* Reply chunk */ + if (*p++ != xdr_zero) { + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + if (inv_rkey == xdr_zero) + inv_rkey = *p; + else if (inv_rkey != *p) + return; + p += 4; + } + } + + ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); +} + /* On entry, xdr->head[0].iov_base points to first byte in the * RPC-over-RDMA header. 
* @@ -746,6 +808,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; } + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); p += rpcrdma_fixed_maxsz; if (*p != xdr_zero) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 8602a5f1b515..cf51b8f9b15f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -484,32 +484,6 @@ static void svc_rdma_get_write_arrays(__be32 *rdma_argp, *reply = NULL; } -/* RPC-over-RDMA Version One private extension: Remote Invalidation. - * Responder's choice: requester signals it can handle Send With - * Invalidate, and responder chooses one rkey to invalidate. - * - * Find a candidate rkey to invalidate when sending a reply. Picks the - * first R_key it finds in the chunk lists. - * - * Returns zero if RPC's chunk lists are empty. - */ -static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, - __be32 *wr_lst, __be32 *rp_ch) -{ - __be32 *p; - - p = rdma_argp + rpcrdma_fixed_maxsz; - if (*p != xdr_zero) - p += 2; - else if (wr_lst && be32_to_cpup(wr_lst + 1)) - p = wr_lst + 2; - else if (rp_ch && be32_to_cpup(rp_ch + 1)) - p = rp_ch + 2; - else - return 0; - return be32_to_cpup(p); -} - static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, struct page *page, @@ -672,7 +646,7 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * * RDMA Send is the last step of transmitting an RPC reply. Pages * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the ctxt's page array. These pages are + * of the rqstp and into the sctxt's page array. These pages are * DMA unmapped by each Write completion, but the subsequent Send * completion finally releases these pages. * @@ -680,32 +654,31 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * - The Reply's transport header will never be larger than a page. 
*/ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - __be32 *rdma_argp, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp, __be32 *wr_lst, __be32 *rp_ch) { int ret; if (!rp_ch) { - ret = svc_rdma_map_reply_msg(rdma, ctxt, + ret = svc_rdma_map_reply_msg(rdma, sctxt, &rqstp->rq_res, wr_lst); if (ret < 0) return ret; } - svc_rdma_save_io_pages(rqstp, ctxt); + svc_rdma_save_io_pages(rqstp, sctxt); - ctxt->sc_send_wr.opcode = IB_WR_SEND; - if (rdma->sc_snd_w_inv) { - ctxt->sc_send_wr.ex.invalidate_rkey = - svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); - if (ctxt->sc_send_wr.ex.invalidate_rkey) - ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + if (rctxt->rc_inv_rkey) { + sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + } else { + sctxt->sc_send_wr.opcode = IB_WR_SEND; } dprintk("svcrdma: posting Send WR with %u sge(s)\n", - ctxt->sc_send_wr.num_sge); - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + sctxt->sc_send_wr.num_sge); + return svc_rdma_send(rdma, &sctxt->sc_send_wr); } /* Given the client-provided Write and Reply chunks, the server was not @@ -741,10 +714,6 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, return 0; } -void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) -{ -} - /** * svc_rdma_sendto - Transmit an RPC reply * @rqstp: processed RPC request, reply XDR already in ::rq_res @@ -809,7 +778,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) } svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); - ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp, + ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp, wr_lst, rp_ch); if (ret < 0) goto err1; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2f7ec8912f49..924c17d46903 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -85,7 +85,6 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_release_rqst = svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, - .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, .xpo_secure_port = svc_rdma_secure_port, @@ -100,64 +99,6 @@ struct svc_xprt_class svc_rdma_class = { .xcl_ident = XPRT_TRANSPORT_RDMA, }; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, - struct sockaddr *, int, int); -static void svc_rdma_bc_detach(struct svc_xprt *); -static void svc_rdma_bc_free(struct svc_xprt *); - -static const struct svc_xprt_ops svc_rdma_bc_ops = { - .xpo_create = svc_rdma_bc_create, - .xpo_detach = svc_rdma_bc_detach, - .xpo_free = svc_rdma_bc_free, - .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, - .xpo_secure_port = svc_rdma_secure_port, -}; - -struct svc_xprt_class svc_rdma_bc_class = { - .xcl_name = "rdma-bc", - .xcl_owner = THIS_MODULE, - .xcl_ops = &svc_rdma_bc_ops, - .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) -}; - -static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, - struct net *net, - struct sockaddr *sa, int salen, - int flags) -{ - struct svcxprt_rdma *cma_xprt; - struct svc_xprt *xprt; - - cma_xprt = svc_rdma_create_xprt(serv, net); - if (!cma_xprt) - return ERR_PTR(-ENOMEM); - xprt = &cma_xprt->sc_xprt; - - svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); - set_bit(XPT_CONG_CTRL, &xprt->xpt_flags); - 
serv->sv_bc_xprt = xprt; - - dprintk("svcrdma: %s(%p)\n", __func__, xprt); - return xprt; -} - -static void svc_rdma_bc_detach(struct svc_xprt *xprt) -{ - dprintk("svcrdma: %s(%p)\n", __func__, xprt); -} - -static void svc_rdma_bc_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - dprintk("svcrdma: %s(%p)\n", __func__, xprt); - if (xprt) - kfree(rdma); -} -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ae2a83828953..fbc171ebfe91 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -268,7 +268,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - trace_xprtrdma_inject_dsc(r_xprt); + trace_xprtrdma_op_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } @@ -284,7 +284,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - trace_xprtrdma_destroy(r_xprt); + trace_xprtrdma_op_destroy(r_xprt); cancel_delayed_work_sync(&r_xprt->rx_connect_worker); @@ -318,17 +318,12 @@ xprt_setup_rdma(struct xprt_create *args) struct sockaddr *sap; int rc; - if (args->addrlen > sizeof(xprt->addr)) { - dprintk("RPC: %s: address too large\n", __func__); + if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); - } xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0); - if (xprt == NULL) { - dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", - __func__); + if (!xprt) return ERR_PTR(-ENOMEM); - } /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; @@ -399,7 +394,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + xprt->max_payload = frwr_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; xprt->max_payload <<= PAGE_SHIFT; @@ -423,7 +418,7 @@ out3: out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: - trace_xprtrdma_destroy(new_xprt); + trace_xprtrdma_op_destroy(new_xprt); xprt_rdma_free_addresses(xprt); xprt_free(xprt); return ERR_PTR(rc); @@ -433,29 +428,33 @@ out1: * xprt_rdma_close - close a transport connection * @xprt: transport context * - * Called during transport shutdown, reconnect, or device removal. + * Called during autoclose or device removal. + * * Caller holds @xprt's send lock to prevent activity on this * transport while the connection is torn down. 
*/ -static void -xprt_rdma_close(struct rpc_xprt *xprt) +void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); + might_sleep(); + + trace_xprtrdma_op_close(r_xprt); + + /* Prevent marshaling and sending of new requests */ + xprt_clear_connected(xprt); if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { - xprt_clear_connected(xprt); rpcrdma_ia_remove(ia); - return; + goto out; } + if (ep->rep_connected == -ENODEV) return; if (ep->rep_connected > 0) xprt->reestablish_timeout = 0; - xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); /* Prepare @xprt for the next connection by reinitializing @@ -463,6 +462,10 @@ xprt_rdma_close(struct rpc_xprt *xprt) */ r_xprt->rx_buf.rb_credits = 1; xprt->cwnd = RPC_CWNDSHIFT; + +out: + ++xprt->connect_cookie; + xprt_disconnect_done(xprt); } /** @@ -525,6 +528,7 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + trace_xprtrdma_op_connect(r_xprt); if (r_xprt->rx_ep.rep_connected != 0) { /* Reconnect */ schedule_delayed_work(&r_xprt->rx_connect_worker, @@ -659,11 +663,11 @@ xprt_rdma_allocate(struct rpc_task *task) rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; - trace_xprtrdma_allocate(task, req); + trace_xprtrdma_op_allocate(task, req); return 0; out_fail: - trace_xprtrdma_allocate(task, NULL); + trace_xprtrdma_op_allocate(task, NULL); return -ENOMEM; } @@ -682,7 +686,7 @@ xprt_rdma_free(struct rpc_task *task) if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) rpcrdma_release_rqst(r_xprt, req); - trace_xprtrdma_rpc_done(task, req); + trace_xprtrdma_op_free(task, req); } /** @@ -696,8 +700,10 @@ xprt_rdma_free(struct rpc_task *task) * %-ENOTCONN if the caller should reconnect and call again * %-EAGAIN if the caller should call again * %-ENOBUFS if the caller should call again after a delay - * %-EIO if a permanent error occurred and the request was not - * sent. Do not try to send this message again. + * %-EMSGSIZE if encoding ran out of buffer space. The request + * was not sent. Do not try to send this message again. + * %-EIO if an I/O error occurred. The request was not sent. + * Do not try to send this message again. 
*/ static int xprt_rdma_send_request(struct rpc_rqst *rqst) @@ -713,7 +719,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) #endif /* CONFIG_SUNRPC_BACKCHANNEL */ if (!xprt_connected(xprt)) - goto drop_connection; + return -ENOTCONN; if (!xprt_request_get_cong(xprt, rqst)) return -EBADSLT; @@ -745,8 +751,8 @@ failed_marshal: if (rc != -ENOTCONN) return rc; drop_connection: - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ + xprt_rdma_close(xprt); + return -ENOTCONN; } void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) @@ -827,7 +833,6 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .inject_disconnect = xprt_rdma_inject_disconnect, #if defined(CONFIG_SUNRPC_BACKCHANNEL) .bc_setup = xprt_rdma_bc_setup, - .bc_up = xprt_rdma_bc_up, .bc_maxpayload = xprt_rdma_bc_maxpayload, .bc_free_rqst = xprt_rdma_bc_free_rqst, .bc_destroy = xprt_rdma_bc_destroy, @@ -844,58 +849,31 @@ static struct xprt_class xprt_rdma = { void xprt_rdma_cleanup(void) { - int rc; - - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; } #endif - rc = xprt_unregister_transport(&xprt_rdma); - if (rc) - dprintk("RPC: %s: xprt_unregister returned %i\n", - __func__, rc); - - rpcrdma_destroy_wq(); - rc = xprt_unregister_transport(&xprt_rdma_bc); - if (rc) - dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", - __func__, rc); + xprt_unregister_transport(&xprt_rdma); + xprt_unregister_transport(&xprt_rdma_bc); } int xprt_rdma_init(void) { int rc; - rc = rpcrdma_alloc_wq(); - if (rc) - return rc; - rc = xprt_register_transport(&xprt_rdma); - if (rc) { - rpcrdma_destroy_wq(); + if (rc) return rc; - } rc = xprt_register_transport(&xprt_rdma_bc); if (rc) { xprt_unregister_transport(&xprt_rdma); - rpcrdma_destroy_wq(); return rc; } - dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); - - dprintk("Defaults:\n"); - dprintk("\tSlots %d\n" - "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", - xprt_rdma_slot_table_entries, - xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) sunrpc_table_header = register_sysctl_table(sunrpc_table); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3ddba94c939f..7749a2bf6887 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -78,53 +78,25 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); +static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); -struct workqueue_struct *rpcrdma_receive_wq __read_mostly; - -int -rpcrdma_alloc_wq(void) -{ - struct workqueue_struct *recv_wq; - - recv_wq = alloc_workqueue("xprtrdma_receive", - WQ_MEM_RECLAIM | WQ_HIGHPRI, - 0); - if (!recv_wq) - return -ENOMEM; - - rpcrdma_receive_wq = recv_wq; - return 0; -} - -void -rpcrdma_destroy_wq(void) -{ - struct workqueue_struct *wq; - - if (rpcrdma_receive_wq) { - wq = rpcrdma_receive_wq; - rpcrdma_receive_wq = NULL; - destroy_workqueue(wq); - } -} - -/** - * rpcrdma_disconnect_worker - Force a disconnect - * @work: endpoint to be disconnected - * - * Provider callbacks can possibly run in an IRQ context. 
This function - * is invoked in a worker thread to guarantee that disconnect wake-up - * calls are always done in process context. +/* Wait for outstanding transport work to finish. */ -static void -rpcrdma_disconnect_worker(struct work_struct *work) +static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, - rep_disconnect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - xprt_force_disconnect(&r_xprt->rx_xprt); + /* Flush Receives, then wait for deferred Reply work + * to complete. + */ + ib_drain_qp(ia->ri_id->qp); + drain_workqueue(buf->rb_completion_wq); + + /* Deferred Reply processing might have scheduled + * local invalidations. + */ + ib_drain_sq(ia->ri_id->qp); } /** @@ -143,15 +115,6 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) rx_ep); trace_xprtrdma_qp_event(r_xprt, event); - pr_err("rpcrdma: %s on device %s connected to %s:%s\n", - ib_event_msg(event->event), event->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); - - if (ep->rep_connected == 1) { - ep->rep_connected = -EIO; - schedule_delayed_work(&ep->rep_disconnect_worker, 0); - wake_up_all(&ep->rep_connect_wait); - } } /** @@ -189,11 +152,13 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - /* WARNING: Only wr_id and status are reliable at this point */ + /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); + --r_xprt->rx_ep.rep_receive_count; if (wc->status != IB_WC_SUCCESS) - goto out_fail; + goto out_flushed; /* status == SUCCESS means all fields in wc are trustworthy */ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); @@ -204,17 +169,16 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); -out_schedule: + rpcrdma_post_recvs(r_xprt, false); rpcrdma_reply_handler(rep); return; -out_fail: +out_flushed: if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); - goto out_schedule; + rpcrdma_recv_buffer_put(rep); } static void @@ -316,7 +280,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->rep_connected = -EAGAIN; goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->connect_cookie; ep->rep_connected = -ECONNABORTED; disconnected: xprt_force_disconnect(xprt); @@ -326,10 +289,9 @@ disconnected: break; } - dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - rdma_event_msg(event->event)); + ia->ri_device->name, rdma_event_msg(event->event)); return 0; } @@ -347,22 +309,15 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(id)) { - rc = PTR_ERR(id); - dprintk("RPC: %s: rdma_create_id() failed %i\n", - __func__, rc); + if (IS_ERR(id)) return id; - } ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->rx_xprt.addr, 
RDMA_RESOLVE_TIMEOUT); - if (rc) { - dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", - __func__, rc); + if (rc) goto out; - } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { trace_xprtrdma_conn_tout(xprt); @@ -375,11 +330,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); - if (rc) { - dprintk("RPC: %s: rdma_resolve_route() failed %i\n", - __func__, rc); + if (rc) goto out; - } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { trace_xprtrdma_conn_tout(xprt); @@ -429,16 +381,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRWR: - if (frwr_is_supported(ia)) { - ia->ri_ops = &rpcrdma_frwr_memreg_ops; - break; - } - /*FALLTHROUGH*/ - case RPCRDMA_MTHCAFMR: - if (fmr_is_supported(ia)) { - ia->ri_ops = &rpcrdma_fmr_memreg_ops; + if (frwr_is_supported(ia)) break; - } /*FALLTHROUGH*/ default: pr_err("rpcrdma: Device %s does not support memreg mode %d\n", @@ -481,7 +425,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) * connection is already gone. */ if (ia->ri_id->qp) { - ib_drain_qp(ia->ri_id->qp); + rpcrdma_xprt_drain(r_xprt); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -552,7 +496,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } ia->ri_max_send_sges = max_sge; - rc = ia->ri_ops->ro_open(ia, ep, cdata); + rc = frwr_open(ia, ep, cdata); if (rc) return rc; @@ -579,16 +523,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_disconnect_worker, - rpcrdma_disconnect_worker); + ep->rep_receive_count = 0; sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, 1, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); - dprintk("RPC: %s: failed to create send CQ: %i\n", - __func__, rc); goto out1; } @@ -597,8 +538,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); - dprintk("RPC: %s: failed to create recv CQ: %i\n", - __func__, rc); goto out2; } @@ -611,7 +550,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Prepare RDMA-CM private message */ pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; - pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok; + pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); ep->rep_remote_cma.private_data = pmsg; @@ -653,8 +592,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_disconnect_worker); - if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); @@ -740,11 +677,8 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, } err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); - if (err) { - dprintk("RPC: %s: rdma_create_qp returned %d\n", - __func__, err); + if (err) goto out_destroy; - } /* Atomically replace the transport's ID and QP. 
*/ rc = 0; @@ -775,8 +709,6 @@ retry: dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); rc = -ENETUNREACH; goto out_noupdate; } @@ -798,11 +730,8 @@ retry: rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); - if (rc) { - dprintk("RPC: %s: rdma_connect() failed with %i\n", - __func__, rc); + if (rc) goto out; - } wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); if (ep->rep_connected <= 0) { @@ -822,8 +751,10 @@ out_noupdate: return rc; } -/* - * rpcrdma_ep_disconnect +/** + * rpcrdma_ep_disconnect - Disconnect underlying transport + * @ep: endpoint to disconnect + * @ia: associated interface adapter * * This is separate from destroy to facilitate the ability * to reconnect without recreating the endpoint. @@ -834,19 +765,20 @@ out_noupdate: void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); int rc; + /* returns without wait if ID is not connected */ rc = rdma_disconnect(ia->ri_id); if (!rc) - /* returns without wait if not connected */ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); else ep->rep_connected = rc; - trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, - rx_ep), rc); + trace_xprtrdma_disconnect(r_xprt, rc); - ib_drain_qp(ia->ri_id->qp); + rpcrdma_xprt_drain(r_xprt); } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -1034,7 +966,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) if (!mr) break; - rc = ia->ri_ops->ro_init_mr(ia, mr); + rc = frwr_init_mr(ia, mr); if (rc) { kfree(mr); break; @@ -1089,9 +1021,9 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) req->rl_buffer = buffer; INIT_LIST_HEAD(&req->rl_registered); - spin_lock(&buffer->rb_reqslock); + spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); - spin_unlock(&buffer->rb_reqslock); + spin_unlock(&buffer->rb_lock); return req; } @@ -1134,8 +1066,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) out_free: kfree(rep); out: - dprintk("RPC: %s: reply buffer %d alloc failed\n", - __func__, rc); return rc; } @@ -1159,7 +1089,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); - spin_lock_init(&buf->rb_reqslock); for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1174,13 +1103,19 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } buf->rb_credits = 1; - buf->rb_posted_receives = 0; INIT_LIST_HEAD(&buf->rb_recv_bufs); rc = rpcrdma_sendctxs_create(r_xprt); if (rc) goto out; + buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s", + WQ_MEM_RECLAIM | WQ_HIGHPRI, + 0, + r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]); + if (!buf->rb_completion_wq) + goto out; + return 0; out: rpcrdma_buffer_destroy(buf); @@ -1194,9 +1129,18 @@ rpcrdma_destroy_rep(struct rpcrdma_rep *rep) kfree(rep); } +/** + * rpcrdma_req_destroy - Destroy an rpcrdma_req object + * @req: unused object to be destroyed + * + * This function assumes that the caller prevents concurrent device + * unload and transport tear-down. 
+ */ void -rpcrdma_destroy_req(struct rpcrdma_req *req) +rpcrdma_req_destroy(struct rpcrdma_req *req) { + list_del(&req->rl_all); + rpcrdma_free_regbuf(req->rl_recvbuf); rpcrdma_free_regbuf(req->rl_sendbuf); rpcrdma_free_regbuf(req->rl_rdmabuf); @@ -1208,7 +1152,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); - struct rpcrdma_ia *ia = rdmab_to_ia(buf); struct rpcrdma_mr *mr; unsigned int count; @@ -1224,7 +1167,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) if (!list_empty(&mr->mr_list)) list_del(&mr->mr_list); - ia->ri_ops->ro_release_mr(mr); + frwr_release_mr(mr); count++; spin_lock(&buf->rb_mrlock); } @@ -1234,11 +1177,24 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) dprintk("RPC: %s: released %u MRs\n", __func__, count); } +/** + * rpcrdma_buffer_destroy - Release all hw resources + * @buf: root control block for resources + * + * ORDERING: relies on a prior ib_drain_qp : + * - No more Send or Receive completions can occur + * - All MRs, reps, and reqs are returned to their free lists + */ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { cancel_delayed_work_sync(&buf->rb_refresh_worker); + if (buf->rb_completion_wq) { + destroy_workqueue(buf->rb_completion_wq); + buf->rb_completion_wq = NULL; + } + rpcrdma_sendctxs_destroy(buf); while (!list_empty(&buf->rb_recv_bufs)) { @@ -1250,19 +1206,14 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_rep(rep); } - spin_lock(&buf->rb_reqslock); - while (!list_empty(&buf->rb_allreqs)) { + while (!list_empty(&buf->rb_send_bufs)) { struct rpcrdma_req *req; - req = list_first_entry(&buf->rb_allreqs, - struct rpcrdma_req, rl_all); - list_del(&req->rl_all); - - spin_unlock(&buf->rb_reqslock); - rpcrdma_destroy_req(req); - spin_lock(&buf->rb_reqslock); + req = list_first_entry(&buf->rb_send_bufs, + struct rpcrdma_req, rl_list); + list_del(&req->rl_list); + rpcrdma_req_destroy(req); } - spin_unlock(&buf->rb_reqslock); rpcrdma_mrs_destroy(buf); } @@ -1329,9 +1280,12 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); + if (mr->mr_dir != DMA_NONE) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + mr->mr_dir = DMA_NONE; + } __rpcrdma_mr_put(&r_xprt->rx_buf, mr); } @@ -1410,7 +1364,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for * receiving the payload of RDMA RECV operations. During Long Calls - * or Replies they may be registered externally via ro_map. + * or Replies they may be registered externally via frwr_map. */ struct rpcrdma_regbuf * rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, @@ -1446,8 +1400,10 @@ __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) (void *)rb->rg_base, rdmab_length(rb), rb->rg_direction); - if (ib_dma_mapping_error(device, rdmab_addr(rb))) + if (ib_dma_mapping_error(device, rdmab_addr(rb))) { + trace_xprtrdma_dma_maperr(rdmab_addr(rb)); return false; + } rb->rg_device = device; rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; @@ -1479,10 +1435,14 @@ rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) kfree(rb); } -/* - * Prepost any receive buffer, then post send. 
+/** + * rpcrdma_ep_post - Post WRs to a transport's Send Queue + * @ia: transport's device information + * @ep: transport's RDMA endpoint information + * @req: rpcrdma_req containing the Send WR to post * - * Receive buffer is donated to hardware, reclaimed upon recv completion. + * Returns 0 if the post was successful, otherwise -ENOTCONN + * is returned. */ int rpcrdma_ep_post(struct rpcrdma_ia *ia, @@ -1501,32 +1461,27 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, --ep->rep_send_count; } - rc = ia->ri_ops->ro_send(ia, req); + rc = frwr_send(ia, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; return 0; } -/** - * rpcrdma_post_recvs - Maybe post some Receive buffers - * @r_xprt: controlling transport - * @temp: when true, allocate temp rpcrdma_rep objects - * - */ -void +static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; int needed, count, rc; rc = 0; count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (buf->rb_posted_receives > needed) + if (ep->rep_receive_count > needed) goto out; - needed -= buf->rb_posted_receives; + needed -= ep->rep_receive_count; count = 0; wr = NULL; @@ -1574,7 +1529,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) --count; } } - buf->rb_posted_receives += count; + ep->rep_receive_count += count; out: trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a13ccb643ce0..5a18472f2c9c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -66,7 +66,6 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { - const struct rpcrdma_memreg_ops *ri_ops; struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -81,8 +80,6 @@ struct rpcrdma_ia { bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; - struct ib_qp_attr ri_qp_attr; - struct ib_qp_init_attr ri_qp_init_attr; }; enum { @@ -101,7 +98,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_disconnect_worker; + int rep_receive_count; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -262,20 +259,12 @@ struct rpcrdma_frwr { }; }; -struct rpcrdma_fmr { - struct ib_fmr *fm_mr; - u64 *fm_physaddrs; -}; - struct rpcrdma_mr { struct list_head mr_list; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; - union { - struct rpcrdma_fmr fmr; - struct rpcrdma_frwr frwr; - }; + struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; u32 mr_length; @@ -401,20 +390,18 @@ struct rpcrdma_buffer { spinlock_t rb_lock; /* protect buf lists */ struct list_head rb_send_bufs; struct list_head rb_recv_bufs; + struct list_head rb_allreqs; + unsigned long rb_flags; u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ - int rb_posted_receives; u32 rb_bc_srv_max_requests; - spinlock_t rb_reqslock; /* protect rb_allreqs */ - struct list_head rb_allreqs; - u32 rb_bc_max_requests; + struct workqueue_struct *rb_completion_wq; struct delayed_work rb_refresh_worker; }; -#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) /* rb_flags */ enum { @@ -465,35 +452,6 @@ struct rpcrdma_stats { }; /* - * Per-registration mode operations - */ -struct rpcrdma_xprt; -struct 
rpcrdma_memreg_ops { - struct rpcrdma_mr_seg * - (*ro_map)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *, int, bool, - struct rpcrdma_mr **); - int (*ro_send)(struct rpcrdma_ia *ia, - struct rpcrdma_req *req); - void (*ro_reminv)(struct rpcrdma_rep *rep, - struct list_head *mrs); - void (*ro_unmap_sync)(struct rpcrdma_xprt *, - struct list_head *); - int (*ro_open)(struct rpcrdma_ia *, - struct rpcrdma_ep *, - struct rpcrdma_create_data_internal *); - size_t (*ro_maxpages)(struct rpcrdma_xprt *); - int (*ro_init_mr)(struct rpcrdma_ia *, - struct rpcrdma_mr *); - void (*ro_release_mr)(struct rpcrdma_mr *mr); - const char *ro_displayname; - const int ro_send_w_inv_ok; -}; - -extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; -extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; - -/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. * @@ -544,10 +502,6 @@ extern unsigned int xprt_rdma_memreg_strategy; int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); -bool frwr_is_supported(struct rpcrdma_ia *); -bool fmr_is_supported(struct rpcrdma_ia *); - -extern struct workqueue_struct *rpcrdma_receive_wq; /* * Endpoint calls - xprtrdma/verbs.c @@ -560,13 +514,12 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); -void rpcrdma_destroy_req(struct rpcrdma_req *); +void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); @@ -604,9 +557,6 @@ rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) return __rpcrdma_dma_map_regbuf(ia, rb); } -int rpcrdma_alloc_wq(void); -void rpcrdma_destroy_wq(void); - /* * Wrappers for chunk registration, shared by read/write chunk code. */ @@ -617,6 +567,23 @@ rpcrdma_data_dir(bool writing) return writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; } +/* Memory registration calls xprtrdma/frwr_ops.c + */ +bool frwr_is_supported(struct rpcrdma_ia *); +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata); +int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); +void frwr_release_mr(struct rpcrdma_mr *mr); +size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); +struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, u32 xid, + struct rpcrdma_mr **mr); +int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); +void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); +void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, + struct list_head *mrs); + /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ @@ -653,6 +620,7 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_close(struct rpc_xprt *xprt); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -661,7 +629,6 @@ void xprt_rdma_cleanup(void); */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); -int xprt_rdma_bc_up(struct svc_serv *, struct net *); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f0b3700cec95..13559e6a460b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -68,8 +68,6 @@ static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - #define XS_TCP_LINGER_TO (15U * HZ) static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; @@ -159,8 +157,6 @@ static struct ctl_table sunrpc_table[] = { { }, }; -#endif - /* * Wait duration for a reply from the RPC portmapper. */ @@ -1400,17 +1396,6 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt) } #if defined(CONFIG_SUNRPC_BACKCHANNEL) -static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) -{ - int ret; - - ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, - SVC_SOCK_ANONYMOUS); - if (ret < 0) - return ret; - return 0; -} - static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; @@ -1600,6 +1585,7 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t /** * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @xprt: controlling transport * @task: task that timed out * * Adjust the congestion window after a retransmit timeout has occurred. @@ -2257,6 +2243,7 @@ out: /** * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint + * @work: queued work item * * Invoked by a work queue tasklet. 
*/ @@ -2665,7 +2652,6 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .inject_disconnect = xs_inject_disconnect, #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, - .bc_up = xs_tcp_bc_up, .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, @@ -3107,10 +3093,8 @@ static struct xprt_class xs_bc_tcp_transport = { */ int init_socket_xprt(void) { -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) sunrpc_table_header = register_sysctl_table(sunrpc_table); -#endif xprt_register_transport(&xs_local_transport); xprt_register_transport(&xs_udp_transport); @@ -3126,12 +3110,10 @@ int init_socket_xprt(void) */ void cleanup_socket_xprt(void) { -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; } -#endif xprt_unregister_transport(&xs_local_transport); xprt_unregister_transport(&xs_udp_transport); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index fb2c0d8f359f..d27f30a9a01d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -319,7 +319,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); - kfree(b); errstr = "failed to create discoverer"; goto rejected; } diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 21f6ccc89401..40f5cae623a7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -904,6 +904,8 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); + if (!hdr) + return -EMSGSIZE; nest = nla_nest_start(args, TIPC_NLA_SOCK); if (!nest) {