From 675716400da6f15b9d3db04ef74ee74ca9a00af3 Mon Sep 17 00:00:00 2001
From: Ilya Maximets
Date: Thu, 4 Jul 2019 17:25:03 +0300
Subject: xdp: fix possible cq entry leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A completion queue address reservation could not be undone. In case of
a bad 'queue_id' or an skb allocation failure, the reserved entry would
be leaked, reducing the total capacity of the completion queue.

Fix that by moving the reservation to the point where failure is no
longer possible. Additionally, the 'queue_id' check is moved out of the
loop, since there is no point in re-checking it on every iteration.

Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Ilya Maximets
Acked-by: Björn Töpel
Tested-by: William Tu
Signed-off-by: Daniel Borkmann
---
 net/xdp/xsk.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index d4d6f10aa936..b994c32a664a 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -240,6 +240,9 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,

 	mutex_lock(&xs->mutex);

+	if (xs->queue_id >= xs->dev->real_num_tx_queues)
+		goto out;
+
 	while (xskq_peek_desc(xs->tx, &desc)) {
 		char *buffer;
 		u64 addr;
@@ -250,12 +253,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			goto out;
 		}

-		if (xskq_reserve_addr(xs->umem->cq))
-			goto out;
-
-		if (xs->queue_id >= xs->dev->real_num_tx_queues)
-			goto out;
-
 		len = desc.len;
 		skb = sock_alloc_send_skb(sk, len, 1, &err);
 		if (unlikely(!skb)) {
@@ -267,7 +264,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		addr = desc.addr;
 		buffer = xdp_umem_get_data(xs->umem, addr);
 		err = skb_store_bits(skb, 0, buffer, len);
-		if (unlikely(err)) {
+		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
 			kfree_skb(skb);
 			goto out;
 		}
--
cgit v1.2.3

From 5464c3a0e9a037b63d5229cdea08dddc01a98aac Mon Sep 17 00:00:00 2001
From: Ilya Maximets
Date: Mon, 8 Jul 2019 14:03:44 +0300
Subject: xdp: fix potential deadlock on socket mutex

There are two call chains:

  a) xsk_bind --> xdp_umem_assign_dev
  b) unregister_netdevice_queue --> xsk_notifier

with the following locking order:

  a) xs->mutex --> rtnl_lock
  b) rtnl_lock --> xdp.lock --> xs->mutex

Taking 'xs->mutex' and 'rtnl_lock' in different orders can produce a
deadlock here. Fix that by taking 'rtnl_lock' before 'xs->mutex' in the
bind call chain (a).
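To make the corrected ordering concrete, here is a simplified sketch of
the two chains after this patch (call bodies elided; an editorial
illustration, not verbatim kernel code):

  a) xsk_bind():
       rtnl_lock();                /* now taken first */
       mutex_lock(&xs->mutex);
       xdp_umem_assign_dev();      /* ASSERT_RTNL(); no longer takes
                                    * rtnl_lock() itself */
  b) xsk_notifier() (unchanged):
       rtnl_lock();
       mutex_lock(&xdp.lock);
       mutex_lock(&xs->mutex);

Both chains now acquire 'rtnl_lock' before 'xs->mutex', giving a single
consistent lock order.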
Reported-by: syzbot+bf64ec93de836d7f4c2c@syzkaller.appspotmail.com
Fixes: 455302d1c9ae ("xdp: fix hang while unregistering device bound to xdp socket")
Signed-off-by: Ilya Maximets
Acked-by: Jonathan Lemon
Signed-off-by: Daniel Borkmann
---
 net/xdp/xdp_umem.c | 16 ++++++----------
 net/xdp/xsk.c      |  2 ++
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 20c91f02d3d8..83de74ca729a 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -87,21 +87,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	struct netdev_bpf bpf;
 	int err = 0;

+	ASSERT_RTNL();
+
 	force_zc = flags & XDP_ZEROCOPY;
 	force_copy = flags & XDP_COPY;

 	if (force_zc && force_copy)
 		return -EINVAL;

-	rtnl_lock();
-	if (xdp_get_umem_from_qid(dev, queue_id)) {
-		err = -EBUSY;
-		goto out_rtnl_unlock;
-	}
+	if (xdp_get_umem_from_qid(dev, queue_id))
+		return -EBUSY;

 	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
 	if (err)
-		goto out_rtnl_unlock;
+		return err;

 	umem->dev = dev;
 	umem->queue_id = queue_id;
@@ -110,7 +109,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,

 	if (force_copy)
 		/* For copy-mode, we are done. */
-		goto out_rtnl_unlock;
+		return 0;

 	if (!dev->netdev_ops->ndo_bpf ||
 	    !dev->netdev_ops->ndo_xsk_async_xmit) {
@@ -125,7 +124,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
 	if (err)
 		goto err_unreg_umem;
-	rtnl_unlock();

 	umem->zc = true;
 	return 0;
@@ -135,8 +133,6 @@ err_unreg_umem:
 		err = 0; /* fallback to copy mode */
 	if (err)
 		xdp_clear_umem_at_qid(dev, queue_id);
-out_rtnl_unlock:
-	rtnl_unlock();
 	return err;
 }

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b994c32a664a..59b57d708697 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
 		return -EINVAL;

+	rtnl_lock();
 	mutex_lock(&xs->mutex);
 	if (xs->state != XSK_READY) {
 		err = -EBUSY;
@@ -515,6 +516,7 @@ out_unlock:
 		xs->state = XSK_BOUND;
 out_release:
 	mutex_unlock(&xs->mutex);
+	rtnl_unlock();
 	return err;
 }
--
cgit v1.2.3

From 6e3d1bbbba55265d327e44a2d14de70462232853 Mon Sep 17 00:00:00 2001
From: Haishuang Yan
Date: Sun, 14 Jul 2019 21:31:22 +0800
Subject: sit: use dst_cache in ipip6_tunnel_xmit

As in other IP tunnels, use dst_cache in the xmit path to avoid
unnecessary FIB lookups.

Signed-off-by: Haishuang Yan
Signed-off-by: David S. Miller
---
 net/ipv6/sit.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 80610899a323..b2ccbc473127 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -900,12 +900,17 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 			   RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6,
 			   0, dst, tiph->saddr, 0, 0,
 			   sock_net_uid(tunnel->net, NULL));
-	rt = ip_route_output_flow(tunnel->net, &fl4, NULL);
-	if (IS_ERR(rt)) {
-		dev->stats.tx_carrier_errors++;
-		goto tx_error_icmp;
+	rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr);
+	if (!rt) {
+		rt = ip_route_output_flow(tunnel->net, &fl4, NULL);
+		if (IS_ERR(rt)) {
+			dev->stats.tx_carrier_errors++;
+			goto tx_error_icmp;
+		}
+		dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr);
 	}
+
 	if (rt->rt_type != RTN_UNICAST) {
 		ip_rt_put(rt);
 		dev->stats.tx_carrier_errors++;
--
cgit v1.2.3

From 071c37983d99da07797294ea78e9da1a6e287144 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Sun, 14 Jul 2019 23:36:11 +0200
Subject: net: neigh: fix multiple neigh timer scheduling

The neigh timer can be scheduled multiple times from userspace by
adding multiple neigh entries and forcing timer scheduling by passing
NTF_USE in the netlink requests. This results in a refcount leak and
in the following dump stack:

[ 32.465295] NEIGH: BUG, double timer add, state is 8
[ 32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65
[ 32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014
[ 32.465313] Call Trace:
[ 32.465318]  dump_stack+0x7c/0xc0
[ 32.465323]  __neigh_event_send+0x20c/0x880
[ 32.465326]  ? ___neigh_create+0x846/0xfb0
[ 32.465329]  ? neigh_lookup+0x2a9/0x410
[ 32.465332]  ? neightbl_fill_info.constprop.0+0x800/0x800
[ 32.465334]  neigh_add+0x4f8/0x5e0
[ 32.465337]  ? neigh_xmit+0x620/0x620
[ 32.465341]  ? find_held_lock+0x85/0xa0
[ 32.465345]  rtnetlink_rcv_msg+0x204/0x570
[ 32.465348]  ? rtnl_dellink+0x450/0x450
[ 32.465351]  ? mark_held_locks+0x90/0x90
[ 32.465354]  ? match_held_lock+0x1b/0x230
[ 32.465357]  netlink_rcv_skb+0xc4/0x1d0
[ 32.465360]  ? rtnl_dellink+0x450/0x450
[ 32.465363]  ? netlink_ack+0x420/0x420
[ 32.465366]  ? netlink_deliver_tap+0x115/0x560
[ 32.465369]  ? __alloc_skb+0xc9/0x2f0
[ 32.465372]  netlink_unicast+0x270/0x330
[ 32.465375]  ? netlink_attachskb+0x2f0/0x2f0
[ 32.465378]  netlink_sendmsg+0x34f/0x5a0
[ 32.465381]  ? netlink_unicast+0x330/0x330
[ 32.465385]  ? move_addr_to_kernel.part.0+0x20/0x20
[ 32.465388]  ? netlink_unicast+0x330/0x330
[ 32.465391]  sock_sendmsg+0x91/0xa0
[ 32.465394]  ___sys_sendmsg+0x407/0x480
[ 32.465397]  ? copy_msghdr_from_user+0x200/0x200
[ 32.465401]  ? _raw_spin_unlock_irqrestore+0x37/0x40
[ 32.465404]  ? lockdep_hardirqs_on+0x17d/0x250
[ 32.465407]  ? __wake_up_common_lock+0xcb/0x110
[ 32.465410]  ? __wake_up_common+0x230/0x230
[ 32.465413]  ? netlink_bind+0x3e1/0x490
[ 32.465416]  ? netlink_setsockopt+0x540/0x540
[ 32.465420]  ? __fget_light+0x9c/0xf0
[ 32.465423]  ? sockfd_lookup_light+0x8c/0xb0
[ 32.465426]  __sys_sendmsg+0xa5/0x110
[ 32.465429]  ? __ia32_sys_shutdown+0x30/0x30
[ 32.465432]  ? __fd_install+0xe1/0x2c0
[ 32.465435]  ? lockdep_hardirqs_off+0xb5/0x100
[ 32.465438]  ? mark_held_locks+0x24/0x90
[ 32.465441]  ? do_syscall_64+0xf/0x270
[ 32.465444]  do_syscall_64+0x63/0x270
[ 32.465448]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fix the issue by unscheduling the neigh timer if the selected entry is
in 'IN_TIMER' state when a netlink request with the NTF_USE flag set is
received.

Reported-by: Marek Majkowski
Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table")
Signed-off-by: Lorenzo Bianconi
Reviewed-by: David Ahern
Signed-off-by: David S. Miller
---
 net/core/neighbour.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 742cea4ce72e..0dfc97bc8760 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1124,6 +1124,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)

 			atomic_set(&neigh->probes,
 				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
+			neigh_del_timer(neigh);
 			neigh->nud_state = NUD_INCOMPLETE;
 			neigh->updated = now;
 			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
@@ -1140,6 +1141,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 		}
 	} else if (neigh->nud_state & NUD_STALE) {
 		neigh_dbg(2, "neigh %p is delayed\n", neigh);
+		neigh_del_timer(neigh);
 		neigh->nud_state = NUD_DELAY;
 		neigh->updated = jiffies;
 		neigh_add_timer(neigh, jiffies +
--
cgit v1.2.3

From b43995469e5804636a55372e9bbb17ccb22441c5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Mon, 15 Jul 2019 09:39:52 -0700
Subject: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok

Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate
that it can be used for both loads and stores.

Cc: Yonghong Song
Signed-off-by: Stanislav Fomichev
Signed-off-by: Daniel Borkmann
---
 include/linux/filter.h |  2 +-
 net/core/filter.c      | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6d944369ca87..ff65d22cf336 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -747,7 +747,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 	return size <= size_default && (size & (size - 1)) == 0;
 }

-#define bpf_ctx_wide_store_ok(off, size, type, field)			\
+#define bpf_ctx_wide_access_ok(off, size, type, field)			\
 	(size == sizeof(__u64) &&					\
 	off >= offsetof(type, field) &&					\
 	off + sizeof(__u64) <= offsetofend(type, field) &&		\

diff --git a/net/core/filter.c b/net/core/filter.c
index 47f6386fb17a..c5983ddb1a9f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6890,14 +6890,14 @@ static bool sock_addr_is_valid_access(int off, int size,
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  user_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
 				return true;

-			if (bpf_ctx_wide_store_ok(off, size,
-						  struct bpf_sock_addr,
-						  msg_src_ip6))
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
 				return true;

 			if (size != size_default)
--
cgit v1.2.3

From d4ecfeb15494ec261fef2d25d96eecba66f0b182 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Mon, 15 Jul 2019 09:39:53 -0700
Subject: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and msg_src_ip6

Add an explicit check for u64 loads of user_ip6 and msg_src_ip6 and
update the comment.
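For illustration only, a sketch of the kind of program the verifier
accepts after this change (section name, includes, and the policy shown
are editorial assumptions, not part of the patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/connect6")
int connect_v6(struct bpf_sock_addr *ctx)
{
	/* 8-byte aligned wide reads of user_ip6 are now accepted by the
	 * verifier; previously only 1-, 2- and 4-byte reads were.
	 */
	__u64 hi = *(__u64 *)&ctx->user_ip6[0];
	__u64 lo = *(__u64 *)&ctx->user_ip6[2];

	/* e.g. allow everything except the unspecified address (::) */
	return (hi | lo) ? 1 : 0;
}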
Cc: Yonghong Song
Signed-off-by: Stanislav Fomichev
Signed-off-by: Daniel Borkmann
---
 include/uapi/linux/bpf.h |  4 ++--
 net/core/filter.c        | 12 +++++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6f68438aa4ed..81be929b89fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3248,7 +3248,7 @@ struct bpf_sock_addr {
 	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
+	__u32 user_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__u32 user_port;	/* Allows 4-byte read and write.
@@ -3260,7 +3260,7 @@ struct bpf_sock_addr {
 	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
 				 * Stored in network byte order.
 				 */
-	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4,8-byte write.
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
 				 * Stored in network byte order.
 				 */
 	__bpf_md_ptr(struct bpf_sock *, sk);

diff --git a/net/core/filter.c b/net/core/filter.c
index c5983ddb1a9f..0f6854ccf894 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6884,9 +6884,19 @@ static bool sock_addr_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
 	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
 				msg_src_ip6[3]):
-		/* Only narrow read access allowed for now. */
 		if (type == BPF_READ) {
 			bpf_ctx_record_field_size(info, size_default);
+
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   user_ip6))
+				return true;
+
+			if (bpf_ctx_wide_access_ok(off, size,
+						   struct bpf_sock_addr,
+						   msg_src_ip6))
+				return true;
+
 			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
 		} else {
--
cgit v1.2.3

From db8051f30fbab7f579d691137f1e23f3bb1ac2eb Mon Sep 17 00:00:00 2001
From: Qian Cai
Date: Tue, 16 Jul 2019 11:43:05 -0400
Subject: skbuff: fix compilation warnings in skb_dump()

The commit 6413139dfc64 ("skbuff: increase verbosity when dumping skb
data") introduced a few compilation warnings:

net/core/skbuff.c:766:32: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat]
                level, sk->sk_family, sk->sk_type, sk->sk_protocol);
                                      ^~~~~~~~~~~
net/core/skbuff.c:766:45: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat]
                level, sk->sk_family, sk->sk_type, sk->sk_protocol);
                                                   ^~~~~~~~~~~~~~~

Fix them by using the proper types.

Fixes: 6413139dfc64 ("skbuff: increase verbosity when dumping skb data")
Signed-off-by: Qian Cai
Reviewed-by: Nathan Chancellor
Acked-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6f1e31f674a3..0338820ee0ec 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -762,7 +762,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
 		printk("%sdev name=%s feat=0x%pNF\n",
 		       level, dev->name, &dev->features);
 	if (sk)
-		printk("%ssk family=%hu type=%hu proto=%hu\n",
+		printk("%ssk family=%hu type=%u proto=%u\n",
 		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

 	if (full_pkt && headroom)
--
cgit v1.2.3

From a5b647007e9d794956dbed9339a3354a9fc4d5c3 Mon Sep 17 00:00:00 2001
From: Vedang Patel
Date: Tue, 16 Jul 2019 12:52:18 -0700
Subject: taprio: Change type of txtime-delay parameter to u32

During the review of the iproute2 patches for txtime-assist mode, it
was pointed out that it does not make sense for the txtime-delay
parameter to be negative. So, change the type of the parameter from
s32 to u32.

Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode")
Reported-by: Stephen Hemminger
Signed-off-by: Vedang Patel
Signed-off-by: David S. Miller
---
 include/uapi/linux/pkt_sched.h | 2 +-
 net/sched/sch_taprio.c         | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 1f623252abe8..18f185299f47 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1174,7 +1174,7 @@ enum {
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
 	TCA_TAPRIO_ATTR_FLAGS, /* u32 */
-	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */
+	TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */
 	__TCA_TAPRIO_ATTR_MAX,
 };

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 388750ddc57a..c39db507ba3f 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -75,7 +75,7 @@ struct taprio_sched {
 	struct sched_gate_list __rcu *admin_sched;
 	struct hrtimer advance_timer;
 	struct list_head taprio_list;
-	int txtime_delay;
+	u32 txtime_delay;
 };

 static ktime_t sched_base_time(const struct sched_gate_list *sched)
@@ -1113,7 +1113,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 			goto unlock;
 		}

-		q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
+		q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
 	}

 	if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
@@ -1430,7 +1430,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto options_error;

 	if (q->txtime_delay &&
-	    nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
+	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
 		goto options_error;

 	if (oper && dump_schedule(skb, oper))
--
cgit v1.2.3

From 86fda90ab5888d983f7307442ba62978e3504bd3 Mon Sep 17 00:00:00 2001
From: Hariprasad Kelam
Date: Tue, 16 Jul 2019 07:50:02 +0530
Subject: net: sctp: fix warning "NULL check before some freeing functions is not needed"

This patch removes NULL checks before calling kfree, fixing the
following issues reported by coccicheck:

net/sctp/sm_make_chunk.c:2586:3-8: WARNING: NULL check before some freeing functions is not needed.
net/sctp/sm_make_chunk.c:2652:3-8: WARNING: NULL check before some freeing functions is not needed.
net/sctp/sm_make_chunk.c:2667:3-8: WARNING: NULL check before some freeing functions is not needed.
net/sctp/sm_make_chunk.c:2684:3-8: WARNING: NULL check before some freeing functions is not needed.
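The transformation is safe because kfree(NULL) is defined to be a
no-op, so in general (illustrative, with a generic 'ptr'):

	if (ptr)		/* redundant guard */
		kfree(ptr);

behaves identically to:

	kfree(ptr);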
Signed-off-by: Hariprasad Kelam
Acked-by: Marcelo Ricardo Leitner
Acked-by: Neil Horman
Signed-off-by: David S. Miller
---
 net/sctp/sm_make_chunk.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index ed39396b9bba..36bd8a6e82df 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2582,8 +2582,7 @@ do_addr_param:
 	case SCTP_PARAM_STATE_COOKIE:
 		asoc->peer.cookie_len =
 			ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
-		if (asoc->peer.cookie)
-			kfree(asoc->peer.cookie);
+		kfree(asoc->peer.cookie);
 		asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp);
 		if (!asoc->peer.cookie)
 			retval = 0;
@@ -2648,8 +2647,7 @@ do_addr_param:
 			goto fall_through;

 		/* Save peer's random parameter */
-		if (asoc->peer.peer_random)
-			kfree(asoc->peer.peer_random);
+		kfree(asoc->peer.peer_random);
 		asoc->peer.peer_random = kmemdup(param.p,
 					    ntohs(param.p->length), gfp);
 		if (!asoc->peer.peer_random) {
@@ -2663,8 +2661,7 @@ do_addr_param:
 			goto fall_through;

 		/* Save peer's HMAC list */
-		if (asoc->peer.peer_hmacs)
-			kfree(asoc->peer.peer_hmacs);
+		kfree(asoc->peer.peer_hmacs);
 		asoc->peer.peer_hmacs = kmemdup(param.p,
 					    ntohs(param.p->length), gfp);
 		if (!asoc->peer.peer_hmacs) {
@@ -2680,8 +2677,7 @@ do_addr_param:
 		if (!ep->auth_enable)
 			goto fall_through;

-		if (asoc->peer.peer_chunks)
-			kfree(asoc->peer.peer_chunks);
+		kfree(asoc->peer.peer_chunks);
 		asoc->peer.peer_chunks = kmemdup(param.p,
 					    ntohs(param.p->length), gfp);
 		if (!asoc->peer.peer_chunks)
--
cgit v1.2.3

From f11fe1dae1c46762a9a7b0dd142b6bfebe7783ff Mon Sep 17 00:00:00 2001
From: YueHaibing
Date: Tue, 16 Jul 2019 15:16:02 +0800
Subject: net/sched: Make NET_ACT_CT depend on NF_NAT

If NF_NAT is m and NET_ACT_CT is y, the build fails:

net/sched/act_ct.o: In function `tcf_ct_act':
act_ct.c:(.text+0x21ac): undefined reference to `nf_ct_nat_ext_add'
act_ct.c:(.text+0x229a): undefined reference to `nf_nat_icmp_reply_translation'
act_ct.c:(.text+0x233a): undefined reference to `nf_nat_setup_info'
act_ct.c:(.text+0x234a): undefined reference to `nf_nat_alloc_null_binding'
act_ct.c:(.text+0x237c): undefined reference to `nf_nat_packet'

Reported-by: Hulk Robot
Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: YueHaibing
Signed-off-by: David S. Miller
---
 net/sched/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index dd55b9ac3a66..afd2ba157a13 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -942,7 +942,7 @@ config NET_ACT_TUNNEL_KEY

 config NET_ACT_CT
 	tristate "connection tracking tc action"
-	depends on NET_CLS_ACT && NF_CONNTRACK
+	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
 	help
 	  Say Y here to allow sending the packets to conntrack module.
--
cgit v1.2.3

From 2c7da8e6b041a8df2661def81ac90c9c0c719909 Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:28:51 -0700
Subject: net/rds: Give fr_state a chance to transition to FRMR_IS_FREE

In the context of FRMR (ib_frmr.c):

Memory regions make it onto the "clean_list" via "rds_ib_flush_mr_pool",
after the memory region has been posted for invalidation via
"rds_ib_post_inv".

At that point in time, "fr_state" may still be in state "FRMR_IS_INUSE",
since the only place where "fr_state" transitions to "FRMR_IS_FREE" is
in "rds_ib_mr_cqe_handler", which is triggered by a tasklet.
So in case we notice that "fr_state != FRMR_IS_FREE" (see below), we
wait for "fr_inv_done" to trigger with a maximum of 10msec. Then we
check again, and only put the memory region onto the drop_list (via
"rds_ib_free_frmr") in case the situation remains unchanged.

This avoids the problem of memory regions bouncing between "clean_list"
and "drop_list" before they even have a chance to be properly
invalidated.

Signed-off-by: Gerd Rausch
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib_frmr.c | 27 ++++++++++++++++++++++++++-
 net/rds/ib_mr.h   |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 32ae26ed58a0..6038138d6e38 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -75,6 +75,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 		pool->max_items_soft = pool->max_items;

 	frmr->fr_state = FRMR_IS_FREE;
+	init_waitqueue_head(&frmr->fr_inv_done);
 	return ibmr;

 out_no_cigar:
@@ -285,6 +286,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
+		wake_up(&frmr->fr_inv_done);
 	}

 	atomic_inc(&ic->i_fastreg_wrs);
@@ -345,8 +347,31 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
 	}

 	do {
-		if (ibmr)
+		if (ibmr) {
+			/* Memory regions make it onto the "clean_list" via
+			 * "rds_ib_flush_mr_pool", after the memory region has
+			 * been posted for invalidation via "rds_ib_post_inv".
+			 *
+			 * At that point in time, "fr_state" may still be
+			 * in state "FRMR_IS_INUSE", since the only place where
+			 * "fr_state" transitions to "FRMR_IS_FREE" is in
+			 * is in "rds_ib_mr_cqe_handler", which is
+			 * triggered by a tasklet.
+			 *
+			 * So we wait for "fr_inv_done" to trigger
+			 * and only put memory regions onto the drop_list
+			 * that failed (i.e. not marked "FRMR_IS_FREE").
+			 *
+			 * This avoids the problem of memory-regions bouncing
+			 * between "clean_list" and "drop_list" before they
+			 * even have a chance to be properly invalidated.
+			 */
+			frmr = &ibmr->u.frmr;
+			wait_event(frmr->fr_inv_done,
+				   frmr->fr_state != FRMR_IS_INUSE);
+			if (frmr->fr_state == FRMR_IS_FREE)
+				break;
 			rds_ib_free_frmr(ibmr, true);
+		}
 		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
 		if (IS_ERR(ibmr))
 			return ibmr;

diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 5da12c248431..42daccb7b5eb 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -57,6 +57,7 @@ struct rds_ib_frmr {
 	struct ib_mr		*mr;
 	enum rds_ib_fr_state	fr_state;
 	bool			fr_inv;
+	wait_queue_head_t	fr_inv_done;
 	struct ib_send_wr	fr_wr;
 	unsigned int		dma_npages;
 	unsigned int		sg_byte_len;
--
cgit v1.2.3

From c9467447fc50ec3715d8ec98f4da874fce539235 Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:28:57 -0700
Subject: net/rds: Get rid of "wait_clean_list_grace" and add locking

Waiting for activity on the "clean_list" to quiesce is no substitute
for proper locking.

We can have multiple threads competing for "llist_del_first" via
"rds_ib_reuse_mr", and a single thread competing for "llist_del_all"
and "llist_del_first" via "rds_ib_flush_mr_pool".
Since "llist_del_first" depends on "list->first->next" not to change in the midst of the operation, simply waiting for all current calls to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate: By the time "wait_clean_list_grace" is done iterating over all CPUs to see that there is no concurrent caller to "rds_ib_reuse_mr", a new caller may have just shown up on the first CPU. Furthermore, explicitly calls out the need for locking: * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, * and llist_del_first or llist_del_all used in other consumers, * then a lock is needed. Also, while at it, drop the unused "pool" parameter from "list_to_llist_nodes". Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_mr.h | 1 + net/rds/ib_rdma.c | 56 ++++++++++++++++++------------------------------------- 2 files changed, 19 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 42daccb7b5eb..ab26c20ed66f 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -98,6 +98,7 @@ struct rds_ib_mr_pool { struct llist_head free_list; /* unused MRs */ struct llist_head clean_list; /* unused & unmapped MRs */ wait_queue_head_t flush_wait; + spinlock_t clean_lock; /* "clean_list" concurrency */ atomic_t free_pinned; /* memory pinned by free MRs */ unsigned long max_items; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0b347f46b2f4..6b047e63a769 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -40,9 +40,6 @@ struct workqueue_struct *rds_ib_mr_wq; -static DEFINE_PER_CPU(unsigned long, clean_list_grace); -#define CLEAN_LIST_BUSY_BIT 0 - static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; @@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; - unsigned long *flag; + unsigned long flags; - preempt_disable(); - flag = this_cpu_ptr(&clean_list_grace); - set_bit(CLEAN_LIST_BUSY_BIT, flag); + spin_lock_irqsave(&pool->clean_lock, flags); ret = llist_del_first(&pool->clean_list); + spin_unlock_irqrestore(&pool->clean_lock, flags); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) @@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } - clear_bit(CLEAN_LIST_BUSY_BIT, flag); - preempt_enable(); return ibmr; } -static inline void wait_clean_list_grace(void) -{ - int cpu; - unsigned long *flag; - - for_each_online_cpu(cpu) { - flag = &per_cpu(clean_list_grace, cpu); - while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) - cpu_relax(); - } -} - void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; @@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist, * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. 
 */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-				struct list_head *list,
+static void list_to_llist_nodes(struct list_head *list,
 				struct llist_node **nodes_head,
 				struct llist_node **nodes_tail)
 {
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	 */
 	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
 	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
-	if (free_all)
+	if (free_all) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->clean_lock, flags);
 		llist_append_to_list(&pool->clean_list, &unmap_list);
+		spin_unlock_irqrestore(&pool->clean_lock, flags);
+	}

 	free_goal = rds_ib_flush_goal(pool, free_all);

@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);

 	if (!list_empty(&unmap_list)) {
-		/* we have to make sure that none of the things we're about
-		 * to put on the clean list would race with other cpus trying
-		 * to pull items off.  The llist would explode if we managed to
-		 * remove something from the clean list and then add it back again
-		 * while another CPU was spinning on that same item in llist_del_first.
-		 *
-		 * This is pretty unlikely, but just in case wait for an llist grace period
-		 * here before adding anything back into the clean list.
-		 */
-		wait_clean_list_grace();
-
-		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+		unsigned long flags;
+
+		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
 		if (ibmr_ret) {
 			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
 			clean_nodes = clean_nodes->next;
 		}
 		/* more than one entry in llist nodes */
-		if (clean_nodes)
+		if (clean_nodes) {
+			spin_lock_irqsave(&pool->clean_lock, flags);
 			llist_add_batch(clean_nodes, clean_tail,
 					&pool->clean_list);
-
+			spin_unlock_irqrestore(&pool->clean_lock, flags);
+		}
 	}

 	atomic_sub(unpinned, &pool->free_pinned);
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 	init_llist_head(&pool->free_list);
 	init_llist_head(&pool->drop_list);
 	init_llist_head(&pool->clean_list);
+	spin_lock_init(&pool->clean_lock);
 	mutex_init(&pool->flush_lock);
 	init_waitqueue_head(&pool->flush_wait);
 	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
--
cgit v1.2.3

From 5f33141d2fc05a2d2134ba0e7b47ce4aa88340f0 Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:29:02 -0700
Subject: net/rds: Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition after posting IB_WR_LOCAL_INV

In order to:

1) avoid a silly bouncing between "clean_list" and "drop_list"
   triggered by function "rds_ib_reg_frmr", as it releases frmr regions
   whose state is not "FRMR_IS_FREE" right away.

2) prevent an invalid access error in a race from a pending
   "IB_WR_LOCAL_INV" operation with a teardown ("dma_unmap_sg",
   "put_page") and de-registration ("ib_dereg_mr") of the corresponding
   memory region.

Signed-off-by: Gerd Rausch
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib_frmr.c | 65 ++++++++++++++++++++++++++++++++-----------------------
 net/rds/ib_mr.h   |  2 ++
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 6038138d6e38..708c553c3da5 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -76,6 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,

 	frmr->fr_state = FRMR_IS_FREE;
 	init_waitqueue_head(&frmr->fr_inv_done);
+	init_waitqueue_head(&frmr->fr_reg_done);
 	return ibmr;

 out_no_cigar:
@@ -124,6 +125,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	 */
 	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
 	frmr->fr_state = FRMR_IS_INUSE;
+	frmr->fr_reg = true;

 	memset(&reg_wr, 0, sizeof(reg_wr));
 	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
@@ -144,7 +146,17 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 		if (printk_ratelimit())
 			pr_warn("RDS/IB: %s returned error(%d)\n",
 				__func__, ret);
+		goto out;
 	}
+
+	/* Wait for the registration to complete in order to prevent an invalid
+	 * access error resulting from a race between the memory region already
+	 * being accessed while registration is still pending.
+	 */
+	wait_event(frmr->fr_reg_done, !frmr->fr_reg);
+
+out:
+
 	return ret;
 }

@@ -262,6 +274,19 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
+
+	/* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
+	 * 1) avoid a silly bouncing between "clean_list" and "drop_list"
+	 *    triggered by function "rds_ib_reg_frmr" as it is releases frmr
+	 *    regions whose state is not "FRMR_IS_FREE" right away.
+	 * 2) prevents an invalid access error in a race
+	 *    from a pending "IB_WR_LOCAL_INV" operation
+	 *    with a teardown ("dma_unmap_sg", "put_page")
+	 *    and de-registration ("ib_dereg_mr") of the corresponding
+	 *    memory region.
+	 */
+	wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
+
 out:
 	return ret;
 }
@@ -289,6 +314,11 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 		wake_up(&frmr->fr_inv_done);
 	}

+	if (frmr->fr_reg) {
+		frmr->fr_reg = false;
+		wake_up(&frmr->fr_reg_done);
+	}
+
 	atomic_inc(&ic->i_fastreg_wrs);
 }

@@ -297,14 +327,18 @@ void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
 {
 	struct rds_ib_mr *ibmr, *next;
 	struct rds_ib_frmr *frmr;
-	int ret = 0;
+	int ret = 0, ret2;
 	unsigned int freed = *nfreed;

 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
 	list_for_each_entry(ibmr, list, unmap_list) {
-		if (ibmr->sg_dma_len)
-			ret |= rds_ib_post_inv(ibmr);
+		if (ibmr->sg_dma_len) {
+			ret2 = rds_ib_post_inv(ibmr);
+			if (ret2 && !ret)
+				ret = ret2;
+		}
 	}
+
 	if (ret)
 		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);

@@ -347,31 +381,8 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
 	}

 	do {
-		if (ibmr) {
-			/* Memory regions make it onto the "clean_list" via
-			 * "rds_ib_flush_mr_pool", after the memory region has
-			 * been posted for invalidation via "rds_ib_post_inv".
-			 *
-			 * At that point in time, "fr_state" may still be
-			 * in state "FRMR_IS_INUSE", since the only place where
-			 * "fr_state" transitions to "FRMR_IS_FREE" is in
-			 * is in "rds_ib_mr_cqe_handler", which is
-			 * triggered by a tasklet.
-			 *
-			 * So we wait for "fr_inv_done" to trigger
-			 * and only put memory regions onto the drop_list
-			 * that failed (i.e. not marked "FRMR_IS_FREE").
-			 *
-			 * This avoids the problem of memory-regions bouncing
-			 * between "clean_list" and "drop_list" before they
-			 * even have a chance to be properly invalidated.
-			 */
-			frmr = &ibmr->u.frmr;
-			wait_event(frmr->fr_inv_done,
-				   frmr->fr_state != FRMR_IS_INUSE);
-			if (frmr->fr_state == FRMR_IS_FREE)
-				break;
+		if (ibmr)
 			rds_ib_free_frmr(ibmr, true);
-		}
 		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
 		if (IS_ERR(ibmr))
 			return ibmr;

diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index ab26c20ed66f..9045a8c0edff 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -58,6 +58,8 @@ struct rds_ib_frmr {
 	enum rds_ib_fr_state	fr_state;
 	bool			fr_inv;
 	wait_queue_head_t	fr_inv_done;
+	bool			fr_reg;
+	wait_queue_head_t	fr_reg_done;
 	struct ib_send_wr	fr_wr;
 	unsigned int		dma_npages;
 	unsigned int		sg_byte_len;
--
cgit v1.2.3

From aea01a2234d26ffa9d9ee01e43705824c0c7b08a Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:29:07 -0700
Subject: net/rds: Fix NULL/ERR_PTR inconsistency

Make function "rds_ib_try_reuse_ibmr" return NULL in case the memory
region could not be allocated, since callers simply check whether the
return value is not NULL.

Signed-off-by: Gerd Rausch
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib_rdma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 6b047e63a769..c8c1e3ae8d84 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -450,7 +450,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
 			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
 		else
 			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-		return ERR_PTR(-EAGAIN);
+		break;
 	}

 	/* We do have some empty MRs. Flush them out. */
@@ -464,7 +464,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
 		return ibmr;
 	}

-	return ibmr;
+	return NULL;
 }

 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
--
cgit v1.2.3

From 9547dff1085d5935d6070377023096821033e30c Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:29:12 -0700
Subject: net/rds: Set fr_state only to FRMR_IS_FREE if IB_WR_LOCAL_INV had been successful

Fix a bug where fr_state first goes to FRMR_IS_STALE because of a
failure of the IB_WR_LOCAL_INV operation, but then gets set back to
FRMR_IS_FREE unconditionally, even though the operation failed.

Signed-off-by: Gerd Rausch
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib_frmr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 708c553c3da5..adaa8e99e5a9 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -309,7 +309,8 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	}

 	if (frmr->fr_inv) {
-		frmr->fr_state = FRMR_IS_FREE;
+		if (frmr->fr_state == FRMR_IS_INUSE)
+			frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
 		wake_up(&frmr->fr_inv_done);
 	}
--
cgit v1.2.3

From 3a2886cca703fde5ee21baea9fedf8b1389c59d7 Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:29:17 -0700
Subject: net/rds: Keep track of and wait for FRWR segments in use upon shutdown

Since "rds_ib_free_frmr" and "rds_ib_free_frmr_list" simply put the
FRMR memory segments on the "drop_list" or "free_list", and it is the
job of "rds_ib_flush_mr_pool" to reap those entries by ultimately
issuing an "IB_WR_LOCAL_INV" work request, we need to trigger and then
wait for all those memory segments attached to a particular connection
to be fully released before we can move on to release the QP, CQ, etc.

So we make "rds_ib_conn_path_shutdown" wait for one more atomic_t
called "i_fastreg_inuse_count" that keeps track of how many FRWR memory
segments are out there marked "FRMR_IS_INUSE" (and also wake up
rds_ib_ring_empty_wait, as they go away).

Signed-off-by: Gerd Rausch
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib.h      |  1 +
 net/rds/ib_cm.c   |  7 +++++++
 net/rds/ib_frmr.c | 43 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 66c03c7665b2..303c6ee8bdb7 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -156,6 +156,7 @@ struct rds_ib_connection {

 	/* To control the number of wrs from fastreg */
 	atomic_t		i_fastreg_wrs;
+	atomic_t		i_fastreg_inuse_count;

 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 8891822eba4f..1b6fd6c8b12b 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -40,6 +40,7 @@
 #include "rds_single_path.h"
 #include "rds.h"
 #include "ib.h"
+#include "ib_mr.h"

 /*
  * Set the selected protocol version
@@ -993,6 +994,11 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 				 ic->i_cm_id, err);
 		}

+		/* kick off "flush_worker" for all pools in order to reap
+		 * all FRMR registrations that are still marked "FRMR_IS_INUSE"
+		 */
+		rds_ib_flush_mrs();
+
 		/*
 		 * We want to wait for tx and rx completion to finish
 		 * before we tear down the connection, but we have to be
@@ -1005,6 +1011,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
+			   (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
 			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index adaa8e99e5a9..06ecf9d2d4bf 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -32,6 +32,24 @@

 #include "ib_mr.h"

+static inline void
+rds_transition_frwr_state(struct rds_ib_mr *ibmr,
+			  enum rds_ib_fr_state old_state,
+			  enum rds_ib_fr_state new_state)
+{
+	if (cmpxchg(&ibmr->u.frmr.fr_state,
+		    old_state, new_state) == old_state &&
+	    old_state == FRMR_IS_INUSE) {
+		/* enforce order of ibmr->u.frmr.fr_state update
+		 * before decrementing i_fastreg_inuse_count
+		 */
+		smp_mb__before_atomic();
+		atomic_dec(&ibmr->ic->i_fastreg_inuse_count);
+		if (waitqueue_active(&rds_ib_ring_empty_wait))
+			wake_up(&rds_ib_ring_empty_wait);
+	}
+}
+
 static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 					   int npages)
 {
@@ -118,13 +136,18 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	if (unlikely(ret != ibmr->sg_len))
 		return ret < 0 ? ret : -EINVAL;

+	if (cmpxchg(&frmr->fr_state,
+		    FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE)
+		return -EBUSY;
+
+	atomic_inc(&ibmr->ic->i_fastreg_inuse_count);
+
 	/* Perform a WR for the fast_reg_mr. Each individual page
 	 * in the sg list is added to the fast reg page list and placed
 	 * inside the fast_reg_mr WR.  The key used is a rolling 8bit
 	 * counter, which should guarantee uniqueness.
 	 */
 	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
-	frmr->fr_state = FRMR_IS_INUSE;
 	frmr->fr_reg = true;

 	memset(&reg_wr, 0, sizeof(reg_wr));
@@ -141,7 +164,8 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
 	if (unlikely(ret)) {
 		/* Failure here can be because of -ENOMEM as well */
-		frmr->fr_state = FRMR_IS_STALE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
+
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		if (printk_ratelimit())
 			pr_warn("RDS/IB: %s returned error(%d)\n",
@@ -268,8 +292,12 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)

 	ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
 	if (unlikely(ret)) {
-		frmr->fr_state = FRMR_IS_STALE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
 		frmr->fr_inv = false;
+		/* enforce order of frmr->fr_inv update
+		 * before incrementing i_fastreg_wrs
+		 */
+		smp_mb__before_atomic();
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
@@ -297,7 +325,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	struct rds_ib_frmr *frmr = &ibmr->u.frmr;

 	if (wc->status != IB_WC_SUCCESS) {
-		frmr->fr_state = FRMR_IS_STALE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
 		if (rds_conn_up(ic->conn))
 			rds_ib_conn_error(ic->conn,
 					  "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
@@ -309,8 +337,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	}

 	if (frmr->fr_inv) {
-		if (frmr->fr_state == FRMR_IS_INUSE)
-			frmr->fr_state = FRMR_IS_FREE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE);
 		frmr->fr_inv = false;
 		wake_up(&frmr->fr_inv_done);
 	}
@@ -320,6 +347,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 		wake_up(&frmr->fr_reg_done);
 	}

+	/* enforce order of frmr->{fr_reg,fr_inv} update
+	 * before incrementing i_fastreg_wrs
+	 */
+	smp_mb__before_atomic();
 	atomic_inc(&ic->i_fastreg_wrs);
 }
--
cgit v1.2.3

From aa4948937b7d5b4a8e6553f4938f8431b2fd783a Mon Sep 17 00:00:00 2001
From: Gerd Rausch
Date: Tue, 16 Jul 2019 15:29:23 -0700
Subject: net/rds: Initialize ic->i_fastreg_wrs upon allocation

Otherwise, if an IB connection is torn down before "rds_ib_setup_qp"
is called, the value of "ic->i_fastreg_wrs" is still at zero (as it
wasn't initialized by "rds_ib_setup_qp").

Consequently, "rds_ib_conn_path_shutdown" will spin forever, waiting
for it to go back to "RDS_IB_DEFAULT_FR_WR", which of course will
never happen, as there are no outstanding work requests.

Signed-off-by: Gerd Rausch
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 1b6fd6c8b12b..4de0214da63c 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -527,7 +527,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.qp_type = IB_QPT_RC;
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
-	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);

 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -1139,6 +1138,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	spin_lock_init(&ic->i_ack_lock);
 #endif
 	atomic_set(&ic->i_signaled_sends, 0);
+	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);

 	/*
 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
--
cgit v1.2.3

From 3f05e6886a595c9a29a309c52f45326be917823c Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Tue, 16 Jul 2019 13:57:30 -0700
Subject: net_sched: unset TCQ_F_CAN_BYPASS when adding filters

For qdiscs that support TC filters and set TCQ_F_CAN_BYPASS, notably
fq_codel, it makes no sense to let packets bypass the TC filters we
set up in any scenario; otherwise our packet steering policy could not
be enforced.

This can be reproduced easily with the following script:

  ip li add dev dummy0 type dummy
  ifconfig dummy0 up
  tc qd add dev dummy0 root fq_codel
  tc filter add dev dummy0 parent 8001: protocol arp basic action mirred egress redirect dev lo
  tc filter add dev dummy0 parent 8001: protocol ip basic action mirred egress redirect dev lo
  ping -I dummy0 192.168.112.1

Without this patch, packets are sent directly to dummy0 without
hitting any of the filters. With this patch, packets are redirected
to loopback as expected.

This fix is not perfect: it only unsets the flag and does not set it
back, because doing so would require saving that state somewhere in
the qdisc. Note, both fq_codel and sfq clear this flag in their
->bind_tcf(), but this is clearly not sufficient when we don't use
any class ID.

Fixes: 23624935e0c4 ("net_sched: TCQ_F_CAN_BYPASS generalization")
Cc: Eric Dumazet
Signed-off-by: Cong Wang
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/cls_api.c      | 1 +
 net/sched/sch_fq_codel.c | 2 --
 net/sched/sch_sfq.c      | 2 --
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 278014e26aec..d144233423c5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2152,6 +2152,7 @@ replay:
 		tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 			       RTM_NEWTFILTER, false, rtnl_held);
 		tfilter_put(tp, fh);
+		q->flags &= ~TCQ_F_CAN_BYPASS;
 	}

 errout:

diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index e2faf33d282b..d59fbcc745d1 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -596,8 +596,6 @@ static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid)
 static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
 			      u32 classid)
 {
-	/* we cannot bypass queue discipline anymore */
-	sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 420bd8411677..68404a9d2ce4 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -824,8 +824,6 @@ static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 			      u32 classid)
 {
-	/* we cannot bypass queue discipline anymore */
-	sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
--
cgit v1.2.3

From 66f8209547cc11d8e139d45cb7c937c1bbcce182 Mon Sep 17 00:00:00 2001
From: Cong Wang
Date: Wed, 17 Jul 2019 14:41:58 -0700
Subject: fib: relax source validation check for loopback packets

In a rare case where we redirect local packets from veth to lo, these
packets fail to pass the source validation when rp_filter is turned
on, as the tracing shows:

 <...>-311708 [040] ..s1 7951180.957825: fib_table_lookup: table 254 oif 0 iif 1 src 10.53.180.130 dst 10.53.180.130 tos 0 scope 0 flags 0
 <...>-311708 [040] ..s1 7951180.957826: fib_table_lookup_nh: nexthop dev eth0 oif 4 src 10.53.180.130

So, the FIB table lookup returns eth0 as the nexthop even though the
packets are local and should be routed to loopback nonetheless, but
they can't pass the dev match check in fib_info_nh_uses_dev() without
this patch.

It should be safe to relax this check for this special case, as
normally packets coming out of the loopback device still have skb_dst
set, so they won't even hit this slow path.

Cc: Julian Anastasov
Cc: David Ahern
Signed-off-by: Cong Wang
Reviewed-by: David Ahern
Signed-off-by: David S. Miller
---
 net/ipv4/fib_frontend.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 317339cd7f03..e8bc939b56dd 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -388,6 +388,11 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	fib_combine_itag(itag, &res);

 	dev_match = fib_info_nh_uses_dev(res.fi, dev);
+	/* This is not common, loopback packets retain skb_dst so normally they
+	 * would not even hit this slow path.
+	 */
+	dev_match = dev_match || (res.type == RTN_LOCAL &&
+				  dev == net->loopback_dev);
 	if (dev_match) {
 		ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 		return ret;
--
cgit v1.2.3

From 866e5fd8a7123444d865340ff21c1673f74cdecd Mon Sep 17 00:00:00 2001
From: Jon Maloy
Date: Wed, 17 Jul 2019 23:43:44 +0200
Subject: tipc: initialize 'validated' field of received packets

The tipc_msg_validate() function leaves a boolean flag 'validated' in
the validated buffer's control block, to avoid performing this action
more than once.
However, at reception of new packets, the position of this field may
already have been set by lower-layer protocols, so that the packet is
erroneously perceived as already validated by TIPC.

We fix this by initializing the said field to 'false' before performing
the initial validation.

Signed-off-by: Jon Maloy
Signed-off-by: David S. Miller
---
 net/tipc/node.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 324a1f91b394..3a5be1d7e572 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1807,6 +1807,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 	__skb_queue_head_init(&xmitq);

 	/* Ensure message is well-formed before touching the header */
+	TIPC_SKB_CB(skb)->validated = false;
 	if (unlikely(!tipc_msg_validate(&skb)))
 		goto discard;
 	hdr = buf_msg(skb);
--
cgit v1.2.3

From 49d05fe2c9d1b4a27761c9807fec39b8155bef9e Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 17 Jul 2019 15:08:43 -0700
Subject: ipv6: rt6_check should return NULL if 'from' is NULL

Paul reported that l2tp sessions were broken after the commit
referenced in the Fixes tag. Prior to this commit, rt6_check returned
NULL if the rt6_info 'from' was NULL, i.e., the dst_entry was
disconnected from a FIB entry. Restore that behavior.

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Reported-by: Paul Donohue
Tested-by: Paul Donohue
Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4d2e6b31a8d6..6fe3097b9ab7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2563,7 +2563,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt,
 {
 	u32 rt_cookie = 0;

-	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
+	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
 	    rt_cookie != cookie)
 		return NULL;

--
cgit v1.2.3

From 666a3d6e1e6b78df34f59e6c0b8907aa3c8dbb2e Mon Sep 17 00:00:00 2001
From: Su Yanjun
Date: Thu, 18 Jul 2019 10:19:23 +0800
Subject: udp: Fix typo in net/ipv4/udp.c

Signed-off-by: Su Yanjun
Signed-off-by: David S. Miller
---
 net/ipv4/udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c21862ba9c02..d88821c794fb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2170,7 +2170,7 @@ start_lookup:

 /* Initialize UDP checksum. If exited with zero value (success),
  * CHECKSUM_UNNECESSARY means, that no more checks are required.
- * Otherwise, csum completion requires chacksumming packet body,
+ * Otherwise, csum completion requires checksumming packet body,
  * including udp header and folding it to skb->csum.
  */
 static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
--
cgit v1.2.3

From 54851aa90cf27041d64b12f65ac72e9f97bd90fd Mon Sep 17 00:00:00 2001
From: Ido Schimmel
Date: Wed, 17 Jul 2019 23:39:33 +0300
Subject: ipv6: Unlink sibling route in case of failure

When a route needs to be appended to an existing multipath route,
fib6_add_rt2node() first appends it to the siblings list and
increments the number of sibling routes on each sibling. Later, the
function notifies the route via call_fib6_entry_notifiers(). In case
the notification is vetoed, the route is not unlinked from the
siblings list, which can result in a use-after-free.

Fix this by unlinking the route from the siblings list before
returning an error.
We audited the rest of the call sites from which the FIB notification
chain is called and could not find more problems.

Fixes: 2233000cba40 ("net/ipv6: Move call_fib6_entry_notifiers up for route adds")
Signed-off-by: Ido Schimmel
Reported-by: Alexander Petrovskiy
Reviewed-by: David Ahern
Signed-off-by: David S. Miller
---
 net/ipv6/ip6_fib.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 49884f96232b..87f47bc55c5e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1151,8 +1151,24 @@ add:
 		err = call_fib6_entry_notifiers(info->nl_net,
 						FIB_EVENT_ENTRY_ADD,
 						rt, extack);
-		if (err)
+		if (err) {
+			struct fib6_info *sibling, *next_sibling;
+
+			/* If the route has siblings, then it first
+			 * needs to be unlinked from them.
+			 */
+			if (!rt->fib6_nsiblings)
+				return err;
+
+			list_for_each_entry_safe(sibling, next_sibling,
+						 &rt->fib6_siblings,
+						 fib6_siblings)
+				sibling->fib6_nsiblings--;
+			rt->fib6_nsiblings = 0;
+			list_del_init(&rt->fib6_siblings);
+			rt6_multipath_rebalance(next_sibling);
 			return err;
+		}

 		rcu_assign_pointer(rt->fib6_next, iter);
--
cgit v1.2.3

From 008cfbaa3f9f84efead76d2cea12b4dd05cce67d Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Wed, 17 Jul 2019 06:29:56 +0000
Subject: net: dsa: sja1105: Fix missing unlock on error in sk_buff()

Add the missing unlock before returning from function sk_buff() in the
error-handling case.

Fixes: f3097be21bf1 ("net: dsa: sja1105: Add a state machine for RX timestamping")
Signed-off-by: Wei Yongjun
Reviewed-by: Vladimir Oltean
Reviewed-by: Vivien Didelot
Reviewed-by: Florian Fainelli
Signed-off-by: David S. Miller
---
 net/dsa/tag_sja1105.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 1d96c9d4a8e9..26363d72d25b 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -216,6 +216,7 @@ static struct sk_buff
 		if (!skb) {
 			dev_err_ratelimited(dp->ds->dev,
 					    "Failed to copy stampable skb\n");
+			spin_unlock(&sp->data->meta_lock);
 			return NULL;
 		}
 		sja1105_transfer_meta(skb, meta);
--
cgit v1.2.3

From 8d650cdedaabb33e85e9b7c517c0c71fcecc1de9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 18 Jul 2019 19:28:14 -0700
Subject: tcp: fix tcp_set_congestion_control() use from bpf hook

Neal reported incorrect use of ns_capable() from the bpf hook:

bpf_setsockopt(...TCP_CONGESTION...)
  -> tcp_set_congestion_control()
   -> ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)
    -> ns_capable_common()
     -> current_cred()
      -> rcu_dereference_protected(current->cred, 1)

Accessing 'current' in bpf context makes no sense, since packets are
processed from softirq context.

As Neal stated: the capability check in tcp_set_congestion_control()
was written assuming a system call context, and then was reused from
a BPF call site.

The fix is to add a new parameter to tcp_set_congestion_control(), so
that the ns_capable() call is only performed under the right context.

Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control")
Signed-off-by: Eric Dumazet
Cc: Lawrence Brakmo
Reported-by: Neal Cardwell
Acked-by: Neal Cardwell
Acked-by: Lawrence Brakmo
Signed-off-by: David S. Miller
---
 include/net/tcp.h   | 3 ++-
 net/core/filter.c   | 2 +-
 net/ipv4/tcp.c      | 4 +++-
 net/ipv4/tcp_cong.c | 6 +++---
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cca3c59b98bf..f42d300f0cfa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1064,7 +1064,8 @@ void tcp_get_default_congestion_control(struct net *net, char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool reinit, bool cap_net_admin);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);

diff --git a/net/core/filter.c b/net/core/filter.c
index 0f6854ccf894..4e2a79b2fd77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4335,7 +4335,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
 			ret = tcp_set_congestion_control(sk, name, false,
-							 reinit);
+							 reinit, true);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7846afacdf0b..776905899ac0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2785,7 +2785,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;

 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name, true, true);
+		err = tcp_set_congestion_control(sk, name, true, true,
+						 ns_capable(sock_net(sk)->user_ns,
+							    CAP_NET_ADMIN));
 		release_sock(sk);
 		return err;
 	}

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index e1862b64a90f..c445a81d144e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -333,7 +333,8 @@ out:
  * tcp_reinit_congestion_control (if the current congestion control was
  * already initialized.
  */
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+			       bool reinit, bool cap_net_admin)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_congestion_ops *ca;
@@ -369,8 +370,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
 		} else {
 			err = -EBUSY;
 		}
-	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-		     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
+	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
 		err = -EPERM;
 	} else if (!try_module_get(ca->owner)) {
 		err = -EBUSY;
--
cgit v1.2.3
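For context, a minimal sketch of the kind of BPF program that reaches
tcp_set_congestion_control() from softirq context (section name,
callback choice, and program body are editorial assumptions, not part
of the patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_TCP		6	/* from netinet/tcp.h */
#define TCP_CONGESTION	13	/* from netinet/tcp.h */

SEC("sockops")
int set_cc(struct bpf_sock_ops *skops)
{
	char cc[] = "reno";

	/* This runs without a meaningful 'current', which is why the
	 * capability decision must be made by the caller rather than
	 * via ns_capable() inside tcp_set_congestion_control().
	 */
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
			       cc, sizeof(cc));
	return 1;
}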