diff options
Diffstat (limited to 'net')
382 files changed, 14042 insertions, 8519 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h new file mode 100644 index 000000000000..d16bb4b14aa1 --- /dev/null +++ b/net/6lowpan/6lowpan_i.h @@ -0,0 +1,28 @@ +#ifndef __6LOWPAN_I_H +#define __6LOWPAN_I_H + +#include <linux/netdevice.h> + +#ifdef CONFIG_6LOWPAN_DEBUGFS +int lowpan_dev_debugfs_init(struct net_device *dev); +void lowpan_dev_debugfs_exit(struct net_device *dev); + +int __init lowpan_debugfs_init(void); +void lowpan_debugfs_exit(void); +#else +static inline int lowpan_dev_debugfs_init(struct net_device *dev) +{ + return 0; +} + +static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { } + +static inline int __init lowpan_debugfs_init(void) +{ + return 0; +} + +static inline void lowpan_debugfs_exit(void) { } +#endif /* CONFIG_6LOWPAN_DEBUGFS */ + +#endif /* __6LOWPAN_I_H */ diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig index 7fa0f382e7d1..9c051512d14f 100644 --- a/net/6lowpan/Kconfig +++ b/net/6lowpan/Kconfig @@ -5,12 +5,21 @@ menuconfig 6LOWPAN This enables IPv6 over Low power Wireless Personal Area Network - "6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks. +config 6LOWPAN_DEBUGFS + bool "6LoWPAN debugfs support" + depends on 6LOWPAN + depends on DEBUG_FS + ---help--- + This enables 6LoWPAN debugfs support. For example to manipulate + IPHC context information at runtime. + menuconfig 6LOWPAN_NHC - tristate "Next Header Compression Support" + tristate "Next Header and Generic Header Compression Support" depends on 6LOWPAN default y ---help--- - Support for next header compression. + Support for next header and generic header compression defined in + RFC6282 and RFC7400. if 6LOWPAN_NHC @@ -58,4 +67,38 @@ config 6LOWPAN_NHC_UDP ---help--- 6LoWPAN IPv6 UDP Header compression according to RFC6282. +config 6LOWPAN_GHC_EXT_HDR_HOP + tristate "GHC Hop-by-Hop Options Header Support" + ---help--- + 6LoWPAN IPv6 Hop-by-Hop option generic header compression according + to RFC7400. + +config 6LOWPAN_GHC_UDP + tristate "GHC UDP Support" + ---help--- + 6LoWPAN IPv6 UDP generic header compression according to RFC7400. + +config 6LOWPAN_GHC_ICMPV6 + tristate "GHC ICMPv6 Support" + ---help--- + 6LoWPAN IPv6 ICMPv6 generic header compression according to RFC7400. + +config 6LOWPAN_GHC_EXT_HDR_DEST + tristate "GHC Destination Options Header Support" + ---help--- + 6LoWPAN IPv6 destination option generic header compression according + to RFC7400. + +config 6LOWPAN_GHC_EXT_HDR_FRAG + tristate "GHC Fragmentation Options Header Support" + ---help--- + 6LoWPAN IPv6 fragmentation option generic header compression + according to RFC7400. + +config 6LOWPAN_GHC_EXT_HDR_ROUTE + tristate "GHC Routing Options Header Support" + ---help--- + 6LoWPAN IPv6 routing option generic header compression according + to RFC7400. + endif diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index c6ffc55ee0d7..e44f3bf2dd42 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o 6lowpan-y := core.o iphc.o nhc.o +6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o #rfc6282 nhcs obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o @@ -10,3 +11,11 @@ obj-$(CONFIG_6LOWPAN_NHC_IPV6) += nhc_ipv6.o obj-$(CONFIG_6LOWPAN_NHC_MOBILITY) += nhc_mobility.o obj-$(CONFIG_6LOWPAN_NHC_ROUTING) += nhc_routing.o obj-$(CONFIG_6LOWPAN_NHC_UDP) += nhc_udp.o + +#rfc7400 ghcs +obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_HOP) += nhc_ghc_ext_hop.o +obj-$(CONFIG_6LOWPAN_GHC_UDP) += nhc_ghc_udp.o +obj-$(CONFIG_6LOWPAN_GHC_ICMPV6) += nhc_ghc_icmpv6.o +obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_DEST) += nhc_ghc_ext_dest.o +obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG) += nhc_ghc_ext_frag.o +obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE) += nhc_ghc_ext_route.o diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 83b19e072224..faf65baed617 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -15,19 +15,67 @@ #include <net/6lowpan.h> -void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) +#include "6lowpan_i.h" + +int lowpan_register_netdevice(struct net_device *dev, + enum lowpan_lltypes lltype) { + int ret; + dev->addr_len = EUI64_ADDR_LEN; dev->type = ARPHRD_6LOWPAN; dev->mtu = IPV6_MIN_MTU; dev->priv_flags |= IFF_NO_QUEUE; lowpan_priv(dev)->lltype = lltype; + + ret = register_netdevice(dev); + if (ret < 0) + return ret; + + ret = lowpan_dev_debugfs_init(dev); + if (ret < 0) + unregister_netdevice(dev); + + return ret; } -EXPORT_SYMBOL(lowpan_netdev_setup); +EXPORT_SYMBOL(lowpan_register_netdevice); + +int lowpan_register_netdev(struct net_device *dev, + enum lowpan_lltypes lltype) +{ + int ret; + + rtnl_lock(); + ret = lowpan_register_netdevice(dev, lltype); + rtnl_unlock(); + return ret; +} +EXPORT_SYMBOL(lowpan_register_netdev); + +void lowpan_unregister_netdevice(struct net_device *dev) +{ + unregister_netdevice(dev); + lowpan_dev_debugfs_exit(dev); +} +EXPORT_SYMBOL(lowpan_unregister_netdevice); + +void lowpan_unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + lowpan_unregister_netdevice(dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(lowpan_unregister_netdev); static int __init lowpan_module_init(void) { + int ret; + + ret = lowpan_debugfs_init(); + if (ret < 0) + return ret; + request_module_nowait("ipv6"); request_module_nowait("nhc_dest"); @@ -40,6 +88,13 @@ static int __init lowpan_module_init(void) return 0; } + +static void __exit lowpan_module_exit(void) +{ + lowpan_debugfs_exit(); +} + module_init(lowpan_module_init); +module_exit(lowpan_module_exit); MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c new file mode 100644 index 000000000000..88eef84df0fc --- /dev/null +++ b/net/6lowpan/debugfs.c @@ -0,0 +1,53 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: + * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de> + * Copyright (c) 2015 Nordic Semiconductor. All Rights Reserved. + */ + +#include <net/6lowpan.h> + +#include "6lowpan_i.h" + +static struct dentry *lowpan_debugfs; + +int lowpan_dev_debugfs_init(struct net_device *dev) +{ + struct lowpan_priv *lpriv = lowpan_priv(dev); + + /* creating the root */ + lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); + if (!lpriv->iface_debugfs) + goto fail; + + return 0; + +fail: + return -EINVAL; +} + +void lowpan_dev_debugfs_exit(struct net_device *dev) +{ + debugfs_remove_recursive(lowpan_priv(dev)->iface_debugfs); +} + +int __init lowpan_debugfs_init(void) +{ + lowpan_debugfs = debugfs_create_dir("6lowpan", NULL); + if (!lowpan_debugfs) + return -EINVAL; + + return 0; +} + +void lowpan_debugfs_exit(void) +{ + debugfs_remove_recursive(lowpan_debugfs); +} diff --git a/net/6lowpan/nhc_ghc_ext_dest.c b/net/6lowpan/nhc_ghc_ext_dest.c new file mode 100644 index 000000000000..9887b3a15348 --- /dev/null +++ b/net/6lowpan/nhc_ghc_ext_dest.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN Extension Header compression according to RFC7400 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_GHC_EXT_DEST_IDLEN 1 +#define LOWPAN_GHC_EXT_DEST_ID_0 0xb6 +#define LOWPAN_GHC_EXT_DEST_MASK_0 0xfe + +static void dest_ghid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_GHC_EXT_DEST_ID_0; + nhc->idmask[0] = LOWPAN_GHC_EXT_DEST_MASK_0; +} + +LOWPAN_NHC(ghc_ext_dest, "RFC7400 Destination Extension Header", NEXTHDR_DEST, + 0, dest_ghid_setup, LOWPAN_GHC_EXT_DEST_IDLEN, NULL, NULL); + +module_lowpan_nhc(ghc_ext_dest); +MODULE_DESCRIPTION("6LoWPAN generic header destination extension compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_ghc_ext_frag.c b/net/6lowpan/nhc_ghc_ext_frag.c new file mode 100644 index 000000000000..1308b79e939d --- /dev/null +++ b/net/6lowpan/nhc_ghc_ext_frag.c @@ -0,0 +1,28 @@ +/* + * 6LoWPAN Extension Header compression according to RFC7400 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_GHC_EXT_FRAG_IDLEN 1 +#define LOWPAN_GHC_EXT_FRAG_ID_0 0xb4 +#define LOWPAN_GHC_EXT_FRAG_MASK_0 0xfe + +static void frag_ghid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_GHC_EXT_FRAG_ID_0; + nhc->idmask[0] = LOWPAN_GHC_EXT_FRAG_MASK_0; +} + +LOWPAN_NHC(ghc_ext_frag, "RFC7400 Fragmentation Extension Header", + NEXTHDR_FRAGMENT, 0, frag_ghid_setup, + LOWPAN_GHC_EXT_FRAG_IDLEN, NULL, NULL); + +module_lowpan_nhc(ghc_ext_frag); +MODULE_DESCRIPTION("6LoWPAN generic header fragmentation extension compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_ghc_ext_hop.c b/net/6lowpan/nhc_ghc_ext_hop.c new file mode 100644 index 000000000000..baec86fd1974 --- /dev/null +++ b/net/6lowpan/nhc_ghc_ext_hop.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN Extension Header compression according to RFC7400 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_GHC_EXT_HOP_IDLEN 1 +#define LOWPAN_GHC_EXT_HOP_ID_0 0xb0 +#define LOWPAN_GHC_EXT_HOP_MASK_0 0xfe + +static void hop_ghid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_GHC_EXT_HOP_ID_0; + nhc->idmask[0] = LOWPAN_GHC_EXT_HOP_MASK_0; +} + +LOWPAN_NHC(ghc_ext_hop, "RFC7400 Hop-by-Hop Extension Header", NEXTHDR_HOP, 0, + hop_ghid_setup, LOWPAN_GHC_EXT_HOP_IDLEN, NULL, NULL); + +module_lowpan_nhc(ghc_ext_hop); +MODULE_DESCRIPTION("6LoWPAN generic header hop-by-hop extension compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_ghc_ext_route.c b/net/6lowpan/nhc_ghc_ext_route.c new file mode 100644 index 000000000000..d7e5bd791c62 --- /dev/null +++ b/net/6lowpan/nhc_ghc_ext_route.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN Extension Header compression according to RFC7400 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_GHC_EXT_ROUTE_IDLEN 1 +#define LOWPAN_GHC_EXT_ROUTE_ID_0 0xb2 +#define LOWPAN_GHC_EXT_ROUTE_MASK_0 0xfe + +static void route_ghid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_GHC_EXT_ROUTE_ID_0; + nhc->idmask[0] = LOWPAN_GHC_EXT_ROUTE_MASK_0; +} + +LOWPAN_NHC(ghc_ext_route, "RFC7400 Routing Extension Header", NEXTHDR_ROUTING, + 0, route_ghid_setup, LOWPAN_GHC_EXT_ROUTE_IDLEN, NULL, NULL); + +module_lowpan_nhc(ghc_ext_route); +MODULE_DESCRIPTION("6LoWPAN generic header routing extension compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_ghc_icmpv6.c b/net/6lowpan/nhc_ghc_icmpv6.c new file mode 100644 index 000000000000..32e7c2c66bbc --- /dev/null +++ b/net/6lowpan/nhc_ghc_icmpv6.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN ICMPv6 compression according to RFC7400 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_GHC_ICMPV6_IDLEN 1 +#define LOWPAN_GHC_ICMPV6_ID_0 0xdf +#define LOWPAN_GHC_ICMPV6_MASK_0 0xff + +static void icmpv6_ghid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_GHC_ICMPV6_ID_0; + nhc->idmask[0] = LOWPAN_GHC_ICMPV6_MASK_0; +} + +LOWPAN_NHC(ghc_icmpv6, "RFC7400 ICMPv6", NEXTHDR_ICMP, 0, + icmpv6_ghid_setup, LOWPAN_GHC_ICMPV6_IDLEN, NULL, NULL); + +module_lowpan_nhc(ghc_icmpv6); +MODULE_DESCRIPTION("6LoWPAN generic header ICMPv6 compression"); +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/nhc_ghc_udp.c b/net/6lowpan/nhc_ghc_udp.c new file mode 100644 index 000000000000..17beefa52ca8 --- /dev/null +++ b/net/6lowpan/nhc_ghc_udp.c @@ -0,0 +1,27 @@ +/* + * 6LoWPAN UDP compression according to RFC7400 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "nhc.h" + +#define LOWPAN_GHC_UDP_IDLEN 1 +#define LOWPAN_GHC_UDP_ID_0 0xd0 +#define LOWPAN_GHC_UDP_MASK_0 0xf8 + +static void udp_ghid_setup(struct lowpan_nhc *nhc) +{ + nhc->id[0] = LOWPAN_GHC_UDP_ID_0; + nhc->idmask[0] = LOWPAN_GHC_UDP_MASK_0; +} + +LOWPAN_NHC(ghc_udp, "RFC7400 UDP", NEXTHDR_UDP, 0, + udp_ghid_setup, LOWPAN_GHC_UDP_IDLEN, NULL, NULL); + +module_lowpan_nhc(ghc_udp); +MODULE_DESCRIPTION("6LoWPAN generic header UDP compression"); +MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 496b27588493..e2ed69850489 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -30,7 +30,9 @@ bool vlan_do_receive(struct sk_buff **skbp) skb->pkt_type = PACKET_HOST; } - if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) { + if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) && + !netif_is_macvlan_port(vlan_dev) && + !netif_is_bridge_port(vlan_dev)) { unsigned int offset = skb->data - skb_mac_header(skb); /* diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index fded86508117..ad5e2fd1012c 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -30,6 +30,7 @@ #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <net/arp.h> +#include <net/switchdev.h> #include "vlan.h" #include "vlanproc.h" @@ -542,9 +543,9 @@ static int vlan_dev_init(struct net_device *dev) (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); - dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG | + dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | - NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM | + NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; dev->features |= real_dev->vlan_features | NETIF_F_LLTX | @@ -774,6 +775,12 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_fdb_add = switchdev_port_fdb_add, + .ndo_fdb_del = switchdev_port_fdb_del, + .ndo_fdb_dump = switchdev_port_fdb_dump, + .ndo_bridge_setlink = switchdev_port_bridge_setlink, + .ndo_bridge_getlink = switchdev_port_bridge_getlink, + .ndo_bridge_dellink = switchdev_port_bridge_dellink, .ndo_get_lock_subclass = vlan_dev_get_lock_subclass, .ndo_get_iflink = vlan_dev_get_iflink, }; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 3827760c8eef..4acb1d5417aa 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -105,7 +105,7 @@ static struct list_head virtio_chan_list; /* How many bytes left in this page. */ static unsigned int rest_of_page(void *data) { - return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); + return PAGE_SIZE - offset_in_page(data); } /** @@ -143,7 +143,6 @@ static void p9_virtio_close(struct p9_client *client) static void req_done(struct virtqueue *vq) { struct virtio_chan *chan = vq->vdev->priv; - struct p9_fcall *rc; unsigned int len; struct p9_req_t *req; unsigned long flags; @@ -152,8 +151,8 @@ static void req_done(struct virtqueue *vq) while (1) { spin_lock_irqsave(&chan->lock, flags); - rc = virtqueue_get_buf(chan->vq, &len); - if (rc == NULL) { + req = virtqueue_get_buf(chan->vq, &len); + if (req == NULL) { spin_unlock_irqrestore(&chan->lock, flags); break; } @@ -161,9 +160,6 @@ static void req_done(struct virtqueue *vq) spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. */ wake_up(chan->vc_wq); - p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc); - p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag); - req = p9_tag_lookup(chan->client, rc->tag); p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } @@ -284,7 +280,7 @@ req_retry: if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; - err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -369,7 +365,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, return -ENOMEM; *need_drop = 0; - p -= (*offs = (unsigned long)p % PAGE_SIZE); + p -= (*offs = offset_in_page(p)); for (index = 0; index < nr_pages; index++) { if (is_vmalloc_addr(p)) (*pages)[index] = vmalloc_to_page(p); @@ -469,7 +465,7 @@ req_retry_pinned: } BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); - err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { diff --git a/net/Kconfig b/net/Kconfig index 127da94ae25e..174354618f8a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -48,6 +48,9 @@ config COMPAT_NETLINK_MESSAGES config NET_INGRESS bool +config NET_EGRESS + bool + menu "Networking options" source "net/packet/Kconfig" @@ -250,9 +253,14 @@ config XPS depends on SMP default y +config SOCK_CGROUP_DATA + bool + default n + config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis. @@ -260,6 +268,7 @@ config CGROUP_NET_PRIO config CGROUP_NET_CLASSID bool "Network classid cgroup" depends on CGROUPS + select SOCK_CGROUP_DATA ---help--- Cgroup subsystem for use as general purpose socket classid marker that is being used in cls_cgroup and for netfilter matching. diff --git a/net/atm/common.c b/net/atm/common.c index 49a872db7e42..6dc12305799e 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -96,7 +96,7 @@ static void vcc_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up(&wq->wait); rcu_read_unlock(); } @@ -117,7 +117,7 @@ static void vcc_write_space(struct sock *sk) if (vcc_writable(sk)) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); diff --git a/net/atm/mpc.h b/net/atm/mpc.h index 0919a88bbc70..cfc7b745aa91 100644 --- a/net/atm/mpc.h +++ b/net/atm/mpc.h @@ -21,11 +21,11 @@ struct mpoa_client { uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */ rwlock_t ingress_lock; - struct in_cache_ops *in_ops; /* ingress cache operations */ + const struct in_cache_ops *in_ops; /* ingress cache operations */ in_cache_entry *in_cache; /* the ingress cache of this MPC */ rwlock_t egress_lock; - struct eg_cache_ops *eg_ops; /* egress cache operations */ + const struct eg_cache_ops *eg_ops; /* egress cache operations */ eg_cache_entry *eg_cache; /* the egress cache of this MPC */ uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */ diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c index d1b2d9a03144..9e60e74c807d 100644 --- a/net/atm/mpoa_caches.c +++ b/net/atm/mpoa_caches.c @@ -534,7 +534,7 @@ static void eg_destroy_cache(struct mpoa_client *mpc) } -static struct in_cache_ops ingress_ops = { +static const struct in_cache_ops ingress_ops = { in_cache_add_entry, /* add_entry */ in_cache_get, /* get */ in_cache_get_with_mask, /* get_with_mask */ @@ -548,7 +548,7 @@ static struct in_cache_ops ingress_ops = { in_destroy_cache /* destroy_cache */ }; -static struct eg_cache_ops egress_ops = { +static const struct eg_cache_ops egress_ops = { eg_cache_add_entry, /* add_entry */ eg_cache_get_by_cache_id, /* get_by_cache_id */ eg_cache_get_by_tag, /* get_by_tag */ diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ae3a47f9d1d5..fbd0acf80b13 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -805,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 912d9c36fb1c..df625de55ef2 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -185,7 +185,8 @@ unlock: static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num) { - int chunk_size, ret = -ENOMEM, if_offset; + int ret = -ENOMEM; + size_t chunk_size, if_offset; void *data_ptr = NULL; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); @@ -203,8 +204,9 @@ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); /* copy second part */ + if_offset = (del_if_num + 1) * chunk_size; memcpy((char *)data_ptr + del_if_num * chunk_size, - orig_node->bat_iv.bcast_own + ((del_if_num + 1) * chunk_size), + (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, (max_if_num - del_if_num) * chunk_size); free_bcast_own: @@ -361,7 +363,6 @@ batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; - batadv_ogm_packet->flags = BATADV_PRIMARIES_FIRST_HOP; batadv_ogm_packet->ttl = BATADV_TTL; } @@ -842,8 +843,6 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, "Forwarding packet: tq: %i, ttl: %i\n", batadv_ogm_packet->tq, batadv_ogm_packet->ttl); - /* switch of primaries first hop flag when forwarding */ - batadv_ogm_packet->flags &= ~BATADV_PRIMARIES_FIRST_HOP; if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else @@ -1379,6 +1378,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; @@ -1423,6 +1423,13 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, goto out; } + if (is_single_hop_neigh) { + hardif_neigh = batadv_hardif_neigh_get(if_incoming, + ethhdr->h_source); + if (hardif_neigh) + hardif_neigh->last_seen = jiffies; + } + router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { router_router = batadv_orig_router_get(router->orig_node, @@ -1557,6 +1564,8 @@ out: batadv_neigh_node_free_ref(router_router); if (orig_neigh_router) batadv_neigh_node_free_ref(orig_neigh_router); + if (hardif_neigh) + batadv_hardif_neigh_free_ref(hardif_neigh); kfree_skb(skb_priv); } @@ -1862,6 +1871,58 @@ next: } /** + * batadv_iv_hardif_neigh_print - print a single hop neighbour node + * @seq: neighbour table seq_file struct + * @hardif_neigh: hardif neighbour information + */ +static void +batadv_iv_hardif_neigh_print(struct seq_file *seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + int last_secs, last_msecs; + + last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000; + last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000; + + seq_printf(seq, " %10s %pM %4i.%03is\n", + hardif_neigh->if_incoming->net_dev->name, + hardif_neigh->addr, last_secs, last_msecs); +} + +/** + * batadv_iv_ogm_neigh_print - print the single hop neighbour list + * @bat_priv: the bat priv with all the soft interface information + * @seq: neighbour table seq_file struct + */ +static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_hardif_neigh_node *hardif_neigh; + struct batadv_hard_iface *hard_iface; + int batman_count = 0; + + seq_printf(seq, " %10s %-13s %s\n", + "IF", "Neighbor", "last-seen"); + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != net_dev) + continue; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + batadv_iv_hardif_neigh_print(seq, hardif_neigh); + batman_count++; + } + } + rcu_read_unlock(); + + if (batman_count == 0) + seq_puts(seq, "No batman nodes in range ...\n"); +} + +/** * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor @@ -1902,8 +1963,8 @@ out: } /** - * batadv_iv_ogm_neigh_is_eob - check if neigh1 is equally good or better than - * neigh2 from the metric prospective + * batadv_iv_ogm_neigh_is_sob - check if neigh1 is similarly good or better + * than neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison @@ -1913,7 +1974,7 @@ out: * the metric via neigh2, false otherwise. */ static bool -batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1, +batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) @@ -1953,7 +2014,8 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .bat_ogm_schedule = batadv_iv_ogm_schedule, .bat_ogm_emit = batadv_iv_ogm_emit, .bat_neigh_cmp = batadv_iv_ogm_neigh_cmp, - .bat_neigh_is_equiv_or_better = batadv_iv_ogm_neigh_is_eob, + .bat_neigh_is_similar_or_better = batadv_iv_ogm_neigh_is_sob, + .bat_neigh_print = batadv_iv_neigh_print, .bat_orig_print = batadv_iv_ogm_orig_print, .bat_orig_free = batadv_iv_ogm_orig_free, .bat_orig_add_if = batadv_iv_ogm_orig_add_if, diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 191a70290dca..c24c481b666f 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -127,21 +127,17 @@ batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw) } /* finally deinitialize the claim */ -static void batadv_claim_free_rcu(struct rcu_head *rcu) +static void batadv_claim_release(struct batadv_bla_claim *claim) { - struct batadv_bla_claim *claim; - - claim = container_of(rcu, struct batadv_bla_claim, rcu); - batadv_backbone_gw_free_ref(claim->backbone_gw); - kfree(claim); + kfree_rcu(claim, rcu); } /* free a claim, call claim_free_rcu if its the last reference */ static void batadv_claim_free_ref(struct batadv_bla_claim *claim) { if (atomic_dec_and_test(&claim->refcount)) - call_rcu(&claim->rcu, batadv_claim_free_rcu); + batadv_claim_release(claim); } /** @@ -260,7 +256,9 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) } /* all claims gone, initialize CRC */ + spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc = BATADV_BLA_CRC_INIT; + spin_unlock_bh(&backbone_gw->crc_lock); } /** @@ -408,6 +406,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, entry->lasttime = jiffies; entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; + spin_lock_init(&entry->crc_lock); atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); @@ -557,7 +556,9 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, __be16 crc; memcpy(mac, batadv_announce_mac, 4); + spin_lock_bh(&backbone_gw->crc_lock); crc = htons(backbone_gw->crc); + spin_unlock_bh(&backbone_gw->crc_lock); memcpy(&mac[4], &crc, 2); batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid, @@ -618,14 +619,18 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, "bla_add_claim(): changing ownership for %pM, vid %d\n", mac, BATADV_PRINT_VID(vid)); + spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&claim->backbone_gw->crc_lock); batadv_backbone_gw_free_ref(claim->backbone_gw); } /* set (new) backbone gw */ atomic_inc(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; + spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&backbone_gw->crc_lock); backbone_gw->lasttime = jiffies; claim_free_ref: @@ -653,7 +658,9 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_choose_claim, claim); batadv_claim_free_ref(claim); /* reference from the hash is gone */ + spin_lock_bh(&claim->backbone_gw->crc_lock); claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&claim->backbone_gw->crc_lock); /* don't need the reference from hash_find() anymore */ batadv_claim_free_ref(claim); @@ -664,7 +671,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; - u16 crc; + u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return 0; @@ -683,12 +690,16 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", BATADV_PRINT_VID(vid), backbone_gw->orig, crc); - if (backbone_gw->crc != crc) { + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + + if (backbone_crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", backbone_gw->orig, BATADV_PRINT_VID(backbone_gw->vid), - backbone_gw->crc, crc); + backbone_crc, crc); batadv_bla_send_request(backbone_gw); } else { @@ -1153,6 +1164,26 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, } } +/** + * batadv_bla_status_update - purge bla interfaces if necessary + * @net_dev: the soft interface net device + */ +void batadv_bla_status_update(struct net_device *net_dev) +{ + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hard_iface *primary_if; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return; + + /* this function already purges everything when bla is disabled, + * so just call that one. + */ + batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); + batadv_hardif_free_ref(primary_if); +} + /* periodic work to do: * * purge structures when they are too old * * send announcements @@ -1647,6 +1678,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; + u16 backbone_crc; u32 i; bool is_own; u8 *primary_addr; @@ -1669,11 +1701,15 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) hlist_for_each_entry_rcu(claim, head, hash_entry) { is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); + + spin_lock_bh(&claim->backbone_gw->crc_lock); + backbone_crc = claim->backbone_gw->crc; + spin_unlock_bh(&claim->backbone_gw->crc_lock); seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", claim->addr, BATADV_PRINT_VID(claim->vid), claim->backbone_gw->orig, (is_own ? 'x' : ' '), - claim->backbone_gw->crc); + backbone_crc); } rcu_read_unlock(); } @@ -1692,6 +1728,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_hard_iface *primary_if; struct hlist_head *head; int secs, msecs; + u16 backbone_crc; u32 i; bool is_own; u8 *primary_addr; @@ -1722,10 +1759,14 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) if (is_own) continue; + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n", backbone_gw->orig, BATADV_PRINT_VID(backbone_gw->vid), secs, - msecs, backbone_gw->crc); + msecs, backbone_crc); } rcu_read_unlock(); } diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 025152b34282..7ea199b8b5ab 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -22,6 +22,7 @@ #include <linux/types.h> +struct net_device; struct seq_file; struct sk_buff; @@ -42,6 +43,7 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif); +void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index c4c1e8030ba0..037ad0a5f485 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -262,6 +262,13 @@ static int batadv_algorithms_open(struct inode *inode, struct file *file) return single_open(file, batadv_algo_seq_print_text, NULL); } +static int neighbors_open(struct inode *inode, struct file *file) +{ + struct net_device *net_dev = (struct net_device *)inode->i_private; + + return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev); +} + static int batadv_originators_open(struct inode *inode, struct file *file) { struct net_device *net_dev = (struct net_device *)inode->i_private; @@ -375,6 +382,7 @@ static struct batadv_debuginfo *batadv_general_debuginfos[] = { }; /* The following attributes are per soft interface */ +static BATADV_DEBUGINFO(neighbors, S_IRUGO, neighbors_open); static BATADV_DEBUGINFO(originators, S_IRUGO, batadv_originators_open); static BATADV_DEBUGINFO(gateways, S_IRUGO, batadv_gateways_open); static BATADV_DEBUGINFO(transtable_global, S_IRUGO, @@ -394,6 +402,7 @@ static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open); #endif static struct batadv_debuginfo *batadv_mesh_debuginfos[] = { + &batadv_debuginfo_neighbors, &batadv_debuginfo_originators, &batadv_debuginfo_gateways, &batadv_debuginfo_transtable_global, diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 83bc1aaf5800..a49c705fb86b 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -566,6 +566,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) int select; batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key; struct batadv_dat_candidate *res; + struct batadv_dat_entry dat; if (!bat_priv->orig_hash) return NULL; @@ -575,7 +576,9 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) if (!res) return NULL; - ip_key = (batadv_dat_addr_t)batadv_hash_dat(&ip_dst, + dat.ip = ip_dst; + dat.vid = 0; + ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat, BATADV_DAT_ADDR_MAX); batadv_dbg(BATADV_DBG_DAT, bat_priv, diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 700c96c82a15..20d9282f895b 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -71,14 +71,14 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { chain = &orig_node->fragments[i]; - spin_lock_bh(&orig_node->fragments[i].lock); + spin_lock_bh(&chain->lock); if (!check_cb || check_cb(chain)) { - batadv_frag_clear_chain(&orig_node->fragments[i].head); - orig_node->fragments[i].size = 0; + batadv_frag_clear_chain(&chain->head); + chain->size = 0; } - spin_unlock_bh(&orig_node->fragments[i].lock); + spin_unlock_bh(&chain->lock); } } diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 0cb5e6b6f6d4..b51bface8bdd 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -31,27 +31,23 @@ #include "packet.h" /** - * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download - * and upload bandwidth information + * batadv_parse_throughput - parse supplied string buffer to extract throughput + * information * @net_dev: the soft interface net device * @buff: string buffer to parse - * @down: pointer holding the returned download bandwidth information - * @up: pointer holding the returned upload bandwidth information + * @description: text shown when throughput string cannot be parsed + * @throughput: pointer holding the returned throughput information * * Returns false on parse error and true otherwise. */ -static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, - u32 *down, u32 *up) +static bool batadv_parse_throughput(struct net_device *net_dev, char *buff, + const char *description, u32 *throughput) { enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; - char *slash_ptr, *tmp_ptr; - u64 ldown, lup; + u64 lthroughput; + char *tmp_ptr; int ret; - slash_ptr = strchr(buff, '/'); - if (slash_ptr) - *slash_ptr = 0; - if (strlen(buff) > 4) { tmp_ptr = buff + strlen(buff) - 4; @@ -63,90 +59,75 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = kstrtou64(buff, 10, &ldown); + ret = kstrtou64(buff, 10, <hroughput); if (ret) { batadv_err(net_dev, - "Download speed of gateway mode invalid: %s\n", - buff); + "Invalid throughput speed for %s: %s\n", + description, buff); return false; } switch (bw_unit_type) { case BATADV_BW_UNIT_MBIT: /* prevent overflow */ - if (U64_MAX / 10 < ldown) { + if (U64_MAX / 10 < lthroughput) { batadv_err(net_dev, - "Download speed of gateway mode too large: %s\n", - buff); + "Throughput speed for %s too large: %s\n", + description, buff); return false; } - ldown *= 10; + lthroughput *= 10; break; case BATADV_BW_UNIT_KBIT: default: - ldown = div_u64(ldown, 100); + lthroughput = div_u64(lthroughput, 100); break; } - if (U32_MAX < ldown) { + if (lthroughput > U32_MAX) { batadv_err(net_dev, - "Download speed of gateway mode too large: %s\n", - buff); + "Throughput speed for %s too large: %s\n", + description, buff); return false; } - *down = ldown; - - /* we also got some upload info */ - if (slash_ptr) { - bw_unit_type = BATADV_BW_UNIT_KBIT; - - if (strlen(slash_ptr + 1) > 4) { - tmp_ptr = slash_ptr + 1 - 4 + strlen(slash_ptr + 1); + *throughput = lthroughput; - if (strncasecmp(tmp_ptr, "mbit", 4) == 0) - bw_unit_type = BATADV_BW_UNIT_MBIT; + return true; +} - if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) || - (bw_unit_type == BATADV_BW_UNIT_MBIT)) - *tmp_ptr = '\0'; - } +/** + * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download + * and upload bandwidth information + * @net_dev: the soft interface net device + * @buff: string buffer to parse + * @down: pointer holding the returned download bandwidth information + * @up: pointer holding the returned upload bandwidth information + * + * Return: false on parse error and true otherwise. + */ +static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, + u32 *down, u32 *up) +{ + char *slash_ptr; + bool ret; - ret = kstrtou64(slash_ptr + 1, 10, &lup); - if (ret) { - batadv_err(net_dev, - "Upload speed of gateway mode invalid: %s\n", - slash_ptr + 1); - return false; - } + slash_ptr = strchr(buff, '/'); + if (slash_ptr) + *slash_ptr = 0; - switch (bw_unit_type) { - case BATADV_BW_UNIT_MBIT: - /* prevent overflow */ - if (U64_MAX / 10 < lup) { - batadv_err(net_dev, - "Upload speed of gateway mode too large: %s\n", - slash_ptr + 1); - return false; - } - - lup *= 10; - break; - case BATADV_BW_UNIT_KBIT: - default: - lup = div_u64(lup, 100); - break; - } + ret = batadv_parse_throughput(net_dev, buff, "download gateway speed", + down); + if (!ret) + return false; - if (U32_MAX < lup) { - batadv_err(net_dev, - "Upload speed of gateway mode too large: %s\n", - slash_ptr + 1); + /* we also got some upload info */ + if (slash_ptr) { + ret = batadv_parse_throughput(net_dev, slash_ptr + 1, + "upload gateway speed", up); + if (!ret) return false; - } - - *up = lup; } return true; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f11345e163d7..01acccc4d218 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -32,6 +32,7 @@ #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <linux/workqueue.h> #include <net/net_namespace.h> @@ -464,7 +465,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); - ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface); + ret = netdev_master_upper_dev_link(hard_iface->net_dev, + soft_iface, NULL, NULL); if (ret) goto err_dev; @@ -638,9 +640,12 @@ batadv_hardif_add_interface(struct net_device *net_dev) goto free_sysfs; INIT_LIST_HEAD(&hard_iface->list); + INIT_HLIST_HEAD(&hard_iface->neigh_list); INIT_WORK(&hard_iface->cleanup_work, batadv_hardif_remove_interface_finish); + spin_lock_init(&hard_iface->neigh_list_lock); + hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; if (batadv_is_wifi_netdev(net_dev)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; @@ -708,7 +713,8 @@ static int batadv_hard_if_event(struct notifier_block *this, } hard_iface = batadv_hardif_get_by_netdev(net_dev); - if (!hard_iface && event == NETDEV_REGISTER) + if (!hard_iface && (event == NETDEV_REGISTER || + event == NETDEV_POST_TYPE_CHANGE)) hard_iface = batadv_hardif_add_interface(net_dev); if (!hard_iface) @@ -723,6 +729,7 @@ static int batadv_hard_if_event(struct notifier_block *this, batadv_hardif_deactivate_interface(hard_iface); break; case NETDEV_UNREGISTER: + case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); batadv_hardif_remove_interface(hard_iface); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 5a31420513e1..7b12ea8ea29d 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -75,18 +75,6 @@ batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface) call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu); } -/** - * batadv_hardif_free_ref_now - decrement the hard interface refcounter and - * possibly free it (without rcu callback) - * @hard_iface: the hard interface to free - */ -static inline void -batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface) -{ - if (atomic_dec_and_test(&hard_iface->refcount)) - batadv_hardif_free_rcu(&hard_iface->rcu); -} - static inline struct batadv_hard_iface * batadv_primary_if_get_selected(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d7f17c1aa4a4..4b5d61fbadb1 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -552,7 +552,7 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) !bat_algo_ops->bat_ogm_schedule || !bat_algo_ops->bat_ogm_emit || !bat_algo_ops->bat_neigh_cmp || - !bat_algo_ops->bat_neigh_is_equiv_or_better) { + !bat_algo_ops->bat_neigh_is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); return -EINVAL; @@ -747,7 +747,7 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, struct batadv_tvlv_container *tvlv) { - lockdep_assert_held(&bat_priv->tvlv.handler_list_lock); + lockdep_assert_held(&bat_priv->tvlv.container_list_lock); if (!tvlv) return; @@ -908,7 +908,7 @@ end: * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @tvlv_handler: tvlv callback function handling the tvlv content - * @ogm_source: flag indicating wether the tvlv is an ogm or a unicast packet + * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet * @orig_node: orig node emitting the ogm packet * @src: source mac address of the unicast packet * @dst: destination mac address of the unicast packet @@ -961,7 +961,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, * batadv_tvlv_containers_process - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information - * @ogm_source: flag indicating wether the tvlv is an ogm or a unicast packet + * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet * @orig_node: orig node emitting the ogm packet * @src: source mac address of the unicast packet * @dst: destination mac address of the unicast packet @@ -1143,15 +1143,14 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; struct batadv_tvlv_hdr *tvlv_hdr; struct batadv_orig_node *orig_node; - struct sk_buff *skb = NULL; + struct sk_buff *skb; unsigned char *tvlv_buff; unsigned int tvlv_len; ssize_t hdr_len = sizeof(*unicast_tvlv_packet); - bool ret = false; orig_node = batadv_orig_hash_find(bat_priv, dst); if (!orig_node) - goto out; + return; tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; @@ -1180,14 +1179,10 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, tvlv_buff += sizeof(*tvlv_hdr); memcpy(tvlv_buff, tvlv_value, tvlv_value_len); - if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP) - ret = true; - -out: - if (skb && !ret) + if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP) kfree_skb(skb); - if (orig_node) - batadv_orig_node_free_ref(orig_node); +out: + batadv_orig_node_free_ref(orig_node); } /** diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index ebd8af0a1eb0..9dbd9107e7e1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2015.2" +#define BATADV_SOURCE_VERSION "2016.0" #endif /* B.A.T.M.A.N. parameters */ @@ -109,7 +109,7 @@ #define BATADV_MAX_AGGREGATION_MS 100 #define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */ -#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 3) +#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6) #define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10) #define BATADV_BLA_WAIT_PERIODS 3 diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index eb76386f8d4b..75fa5013af72 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -802,7 +802,9 @@ void batadv_mcast_free(struct batadv_priv *bat_priv) batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1); + spin_lock_bh(&bat_priv->tt.commit_lock); batadv_mcast_mla_tt_retract(bat_priv, NULL); + spin_unlock_bh(&bat_priv->tt.commit_lock); } /** diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index f5276be2c77c..cc63b44f0d2e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -203,28 +203,25 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node) } /** - * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove - * its refcount on the orig_node - * @rcu: rcu pointer of the nc node + * batadv_nc_node_release - release nc_node from lists and queue for free after + * rcu grace period + * @nc_node: the nc node to free */ -static void batadv_nc_node_free_rcu(struct rcu_head *rcu) +static void batadv_nc_node_release(struct batadv_nc_node *nc_node) { - struct batadv_nc_node *nc_node; - - nc_node = container_of(rcu, struct batadv_nc_node, rcu); batadv_orig_node_free_ref(nc_node->orig_node); - kfree(nc_node); + kfree_rcu(nc_node, rcu); } /** - * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly - * frees it + * batadv_nc_node_free_ref - decrement the nc node refcounter and possibly + * release it * @nc_node: the nc node to free */ static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node) { if (atomic_dec_and_test(&nc_node->refcount)) - call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu); + batadv_nc_node_release(nc_node); } /** @@ -244,9 +241,7 @@ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path) */ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet) { - if (nc_packet->skb) - kfree_skb(nc_packet->skb); - + kfree_skb(nc_packet->skb); batadv_nc_path_free_ref(nc_packet->nc_path); kfree(nc_packet); } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 7486df9ed48d..fe578f75c391 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -163,92 +163,101 @@ err: } /** - * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object - * @rcu: rcu pointer of the neigh_ifinfo object + * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for + * free after rcu grace period + * @neigh_ifinfo: the neigh_ifinfo object to release */ -static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu) +static void +batadv_neigh_ifinfo_release(struct batadv_neigh_ifinfo *neigh_ifinfo) { - struct batadv_neigh_ifinfo *neigh_ifinfo; - - neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu); - if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing); + batadv_hardif_free_ref(neigh_ifinfo->if_outgoing); - kfree(neigh_ifinfo); + kfree_rcu(neigh_ifinfo, rcu); } /** - * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free - * the neigh_ifinfo (without rcu callback) + * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly release + * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ -static void -batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo) +void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu); + batadv_neigh_ifinfo_release(neigh_ifinfo); } /** - * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free - * the neigh_ifinfo - * @neigh_ifinfo: the neigh_ifinfo object to release + * batadv_hardif_neigh_release - release hardif neigh node from lists and + * queue for free after rcu grace period + * @hardif_neigh: hardif neigh neighbor to free */ -void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo) +static void +batadv_hardif_neigh_release(struct batadv_hardif_neigh_node *hardif_neigh) { - if (atomic_dec_and_test(&neigh_ifinfo->refcount)) - call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu); + spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + hlist_del_init_rcu(&hardif_neigh->list); + spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock); + + batadv_hardif_free_ref(hardif_neigh->if_incoming); + kfree_rcu(hardif_neigh, rcu); +} + +/** + * batadv_hardif_neigh_free_ref - decrement the hardif neighbors refcounter + * and possibly release it + * @hardif_neigh: hardif neigh neighbor to free + */ +void batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh) +{ + if (atomic_dec_and_test(&hardif_neigh->refcount)) + batadv_hardif_neigh_release(hardif_neigh); } /** - * batadv_neigh_node_free_rcu - free the neigh_node - * @rcu: rcu pointer of the neigh_node + * batadv_neigh_node_release - release neigh_node from lists and queue for + * free after rcu grace period + * @neigh_node: neigh neighbor to free */ -static void batadv_neigh_node_free_rcu(struct rcu_head *rcu) +static void batadv_neigh_node_release(struct batadv_neigh_node *neigh_node) { struct hlist_node *node_tmp; - struct batadv_neigh_node *neigh_node; + struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; - neigh_node = container_of(rcu, struct batadv_neigh_node, rcu); bao = neigh_node->orig_node->bat_priv->bat_algo_ops; hlist_for_each_entry_safe(neigh_ifinfo, node_tmp, &neigh_node->ifinfo_list, list) { - batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo); + batadv_neigh_ifinfo_free_ref(neigh_ifinfo); + } + + hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, + neigh_node->addr); + if (hardif_neigh) { + /* batadv_hardif_neigh_get() increases refcount too */ + batadv_hardif_neigh_free_ref(hardif_neigh); + batadv_hardif_neigh_free_ref(hardif_neigh); } if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); - batadv_hardif_free_ref_now(neigh_node->if_incoming); - - kfree(neigh_node); -} + batadv_hardif_free_ref(neigh_node->if_incoming); -/** - * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter - * and possibly free it (without rcu callback) - * @neigh_node: neigh neighbor to free - */ -static void -batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node) -{ - if (atomic_dec_and_test(&neigh_node->refcount)) - batadv_neigh_node_free_rcu(&neigh_node->rcu); + kfree_rcu(neigh_node, rcu); } /** * batadv_neigh_node_free_ref - decrement the neighbors refcounter - * and possibly free it + * and possibly release it * @neigh_node: neigh neighbor to free */ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node) { if (atomic_dec_and_test(&neigh_node->refcount)) - call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu); + batadv_neigh_node_release(neigh_node); } /** @@ -479,6 +488,106 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, } /** + * batadv_hardif_neigh_create - create a hardif neighbour node + * @hard_iface: the interface this neighbour is connected to + * @neigh_addr: the interface address of the neighbour to retrieve + * + * Returns the hardif neighbour node if found or created or NULL otherwise. + */ +static struct batadv_hardif_neigh_node * +batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_hardif_neigh_node *hardif_neigh = NULL; + + spin_lock_bh(&hard_iface->neigh_list_lock); + + /* check if neighbor hasn't been added in the meantime */ + hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); + if (hardif_neigh) + goto out; + + if (!atomic_inc_not_zero(&hard_iface->refcount)) + goto out; + + hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); + if (!hardif_neigh) { + batadv_hardif_free_ref(hard_iface); + goto out; + } + + INIT_HLIST_NODE(&hardif_neigh->list); + ether_addr_copy(hardif_neigh->addr, neigh_addr); + hardif_neigh->if_incoming = hard_iface; + hardif_neigh->last_seen = jiffies; + + atomic_set(&hardif_neigh->refcount, 1); + + if (bat_priv->bat_algo_ops->bat_hardif_neigh_init) + bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh); + + hlist_add_head(&hardif_neigh->list, &hard_iface->neigh_list); + +out: + spin_unlock_bh(&hard_iface->neigh_list_lock); + return hardif_neigh; +} + +/** + * batadv_hardif_neigh_get_or_create - retrieve or create a hardif neighbour + * node + * @hard_iface: the interface this neighbour is connected to + * @neigh_addr: the interface address of the neighbour to retrieve + * + * Returns the hardif neighbour node if found or created or NULL otherwise. + */ +static struct batadv_hardif_neigh_node * +batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) +{ + struct batadv_hardif_neigh_node *hardif_neigh = NULL; + + /* first check without locking to avoid the overhead */ + hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr); + if (hardif_neigh) + return hardif_neigh; + + return batadv_hardif_neigh_create(hard_iface, neigh_addr); +} + +/** + * batadv_hardif_neigh_get - retrieve a hardif neighbour from the list + * @hard_iface: the interface where this neighbour is connected to + * @neigh_addr: the address of the neighbour + * + * Looks for and possibly returns a neighbour belonging to this hard interface. + * Returns NULL if the neighbour is not found. + */ +struct batadv_hardif_neigh_node * +batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) +{ + struct batadv_hardif_neigh_node *tmp_hardif_neigh, *hardif_neigh = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_hardif_neigh, + &hard_iface->neigh_list, list) { + if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr)) + continue; + + if (!atomic_inc_not_zero(&tmp_hardif_neigh->refcount)) + continue; + + hardif_neigh = tmp_hardif_neigh; + break; + } + rcu_read_unlock(); + + return hardif_neigh; +} + +/** * batadv_neigh_node_new - create and init a new neigh_node object * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to @@ -493,11 +602,17 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; + struct batadv_hardif_neigh_node *hardif_neigh = NULL; neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) goto out; + hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface, + neigh_addr); + if (!hardif_neigh) + goto out; + neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); if (!neigh_node) goto out; @@ -523,117 +638,147 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); spin_unlock_bh(&orig_node->neigh_list_lock); + /* increment unique neighbor refcount */ + atomic_inc(&hardif_neigh->refcount); + batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); out: + if (hardif_neigh) + batadv_hardif_neigh_free_ref(hardif_neigh); return neigh_node; } /** - * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object - * @rcu: rcu pointer of the orig_ifinfo object + * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list + * @seq: neighbour table seq_file struct + * @offset: not used + * + * Always returns 0. */ -static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu) +int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) { - struct batadv_orig_ifinfo *orig_ifinfo; - struct batadv_neigh_node *router; + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hard_iface *primary_if; - orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu); + primary_if = batadv_seq_print_text_primary_if_get(seq); + if (!primary_if) + return 0; - if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) - batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing); + seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", + BATADV_SOURCE_VERSION, primary_if->net_dev->name, + primary_if->net_dev->dev_addr, net_dev->name, + bat_priv->bat_algo_ops->name); - /* this is the last reference to this object */ - router = rcu_dereference_protected(orig_ifinfo->router, true); - if (router) - batadv_neigh_node_free_ref_now(router); - kfree(orig_ifinfo); + batadv_hardif_free_ref(primary_if); + + if (!bat_priv->bat_algo_ops->bat_neigh_print) { + seq_puts(seq, + "No printing function for this routing protocol\n"); + return 0; + } + + bat_priv->bat_algo_ops->bat_neigh_print(bat_priv, seq); + return 0; } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free - * the orig_ifinfo (without rcu callback) + * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for + * free after rcu grace period * @orig_ifinfo: the orig_ifinfo object to release */ -static void -batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo) +static void batadv_orig_ifinfo_release(struct batadv_orig_ifinfo *orig_ifinfo) { - if (atomic_dec_and_test(&orig_ifinfo->refcount)) - batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu); + struct batadv_neigh_node *router; + + if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT) + batadv_hardif_free_ref(orig_ifinfo->if_outgoing); + + /* this is the last reference to this object */ + router = rcu_dereference_protected(orig_ifinfo->router, true); + if (router) + batadv_neigh_node_free_ref(router); + + kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free + * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo) { if (atomic_dec_and_test(&orig_ifinfo->refcount)) - call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu); + batadv_orig_ifinfo_release(orig_ifinfo); } +/** + * batadv_orig_node_free_rcu - free the orig_node + * @rcu: rcu pointer of the orig_node + */ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) { - struct hlist_node *node_tmp; - struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; - struct batadv_orig_ifinfo *orig_ifinfo; orig_node = container_of(rcu, struct batadv_orig_node, rcu); + batadv_mcast_purge_orig(orig_node); + + batadv_frag_purge_orig(orig_node, NULL); + + if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) + orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); + + kfree(orig_node->tt_buff); + kfree(orig_node); +} + +/** + * batadv_orig_node_release - release orig_node from lists and queue for + * free after rcu grace period + * @orig_node: the orig node to free + */ +static void batadv_orig_node_release(struct batadv_orig_node *orig_node) +{ + struct hlist_node *node_tmp; + struct batadv_neigh_node *neigh_node; + struct batadv_orig_ifinfo *orig_ifinfo; + spin_lock_bh(&orig_node->neigh_list_lock); /* for all neighbors towards this originator ... */ hlist_for_each_entry_safe(neigh_node, node_tmp, &orig_node->neigh_list, list) { hlist_del_rcu(&neigh_node->list); - batadv_neigh_node_free_ref_now(neigh_node); + batadv_neigh_node_free_ref(neigh_node); } hlist_for_each_entry_safe(orig_ifinfo, node_tmp, &orig_node->ifinfo_list, list) { hlist_del_rcu(&orig_ifinfo->list); - batadv_orig_ifinfo_free_ref_now(orig_ifinfo); + batadv_orig_ifinfo_free_ref(orig_ifinfo); } spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_mcast_purge_orig(orig_node); - /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - batadv_frag_purge_orig(orig_node, NULL); - - if (orig_node->bat_priv->bat_algo_ops->bat_orig_free) - orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node); - - kfree(orig_node->tt_buff); - kfree(orig_node); + call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } /** * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly - * schedule an rcu callback for freeing it + * release it * @orig_node: the orig node to free */ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) { if (atomic_dec_and_test(&orig_node->refcount)) - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); -} - -/** - * batadv_orig_node_free_ref_now - decrement the orig node refcounter and - * possibly free it (without rcu callback) - * @orig_node: the orig node to free - */ -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node) -{ - if (atomic_dec_and_test(&orig_node->refcount)) - batadv_orig_node_free_rcu(&orig_node->rcu); + batadv_orig_node_release(orig_node); } void batadv_originator_free(struct batadv_priv *bat_priv) diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index fa18f9bf266b..cf0730414ed2 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -38,9 +38,13 @@ int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); -void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); +struct batadv_hardif_neigh_node * +batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr); +void +batadv_hardif_neigh_free_ref(struct batadv_hardif_neigh_node *hardif_neigh); struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, @@ -57,6 +61,8 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo); +int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); + struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 11f996b39fef..0558e3237e0e 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -72,8 +72,7 @@ enum batadv_subtype { * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was * previously received from someone else than the best neighbor. - * @BATADV_PRIMARIES_FIRST_HOP: flag is set when the primary interface address - * is used, and the packet travels its first hop. + * @BATADV_PRIMARIES_FIRST_HOP: flag unused. * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a * one hop neighbor on the interface where it was originally received. */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 8d990b070a2e..e4f2646d9246 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -497,9 +497,9 @@ batadv_find_router(struct batadv_priv *bat_priv, /* alternative candidate should be good enough to be * considered */ - if (!bao->bat_neigh_is_equiv_or_better(cand_router, - cand->if_outgoing, - router, recv_if)) + if (!bao->bat_neigh_is_similar_or_better(cand_router, + cand->if_outgoing, + router, recv_if)) goto next; /* don't use the same router twice */ @@ -836,6 +836,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, u8 *orig_addr; struct batadv_orig_node *orig_node = NULL; int check, hdr_size = sizeof(*unicast_packet); + enum batadv_subtype subtype; bool is4addr; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -863,10 +864,20 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, /* packet for me */ if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) { if (is4addr) { - batadv_dat_inc_counter(bat_priv, - unicast_4addr_packet->subtype); - orig_addr = unicast_4addr_packet->src; - orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + subtype = unicast_4addr_packet->subtype; + batadv_dat_inc_counter(bat_priv, subtype); + + /* Only payload data should be considered for speedy + * join. For example, DAT also uses unicast 4addr + * types, but those packets should not be considered + * for speedy join, since the clients do not actually + * reside at the sending originator. + */ + if (subtype == BATADV_P_DATA) { + orig_addr = unicast_4addr_packet->src; + orig_node = batadv_orig_hash_find(bat_priv, + orig_addr); + } } if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index f664324805eb..782fa33ec296 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -407,8 +407,7 @@ void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { - if (forw_packet->skb) - kfree_skb(forw_packet->skb); + kfree_skb(forw_packet->skb); if (forw_packet->if_incoming) batadv_hardif_free_ref(forw_packet->if_incoming); if (forw_packet->if_outgoing) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 9de3c8804ff4..fe87777fda8a 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -40,6 +40,7 @@ #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" +#include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "network-coding.h" #include "packet.h" @@ -241,10 +242,13 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ static int batadv_store_bool_attr(char *buff, size_t count, struct net_device *net_dev, - const char *attr_name, atomic_t *attr) + const char *attr_name, atomic_t *attr, + bool *changed) { int enabled = -1; + *changed = false; + if (buff[count - 1] == '\n') buff[count - 1] = '\0'; @@ -271,6 +275,8 @@ static int batadv_store_bool_attr(char *buff, size_t count, atomic_read(attr) == 1 ? "enabled" : "disabled", enabled == 1 ? "enabled" : "disabled"); + *changed = true; + atomic_set(attr, (unsigned int)enabled); return count; } @@ -281,11 +287,12 @@ __batadv_store_bool_attr(char *buff, size_t count, struct attribute *attr, atomic_t *attr_store, struct net_device *net_dev) { + bool changed; int ret; ret = batadv_store_bool_attr(buff, count, net_dev, attr->name, - attr_store); - if (post_func && ret) + attr_store, &changed); + if (post_func && changed) post_func(net_dev); return ret; @@ -549,7 +556,8 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL); BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL); #ifdef CONFIG_BATMAN_ADV_BLA -BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR, NULL); +BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR, + batadv_bla_status_update); #endif #ifdef CONFIG_BATMAN_ADV_DAT BATADV_ATTR_SIF_BOOL(distributed_arp_table, S_IRUGO | S_IWUSR, diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 4228b10c47ea..cdfc85fa2743 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -68,13 +68,15 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, unsigned short vid, const char *message, bool roaming); -/* returns 1 if they are the same mac addr */ +/* returns 1 if they are the same mac addr and vid */ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); + const struct batadv_tt_common_entry *tt1 = data1; + const struct batadv_tt_common_entry *tt2 = data2; - return batadv_compare_eth(data1, data2); + return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2); } /** @@ -238,20 +240,6 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, return count; } -static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_orig_list_entry *orig_entry; - - orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - - /* We are in an rcu callback here, therefore we cannot use - * batadv_orig_node_free_ref() and its call_rcu(): - * An rcu_barrier() wouldn't wait for that to finish - */ - batadv_orig_node_free_ref_now(orig_entry->orig_node); - kfree(orig_entry); -} - /** * batadv_tt_local_size_mod - change the size by v of the local table identified * by vid @@ -347,13 +335,25 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, batadv_tt_global_size_mod(orig_node, vid, -1); } +/** + * batadv_tt_orig_list_entry_release - release tt orig entry from lists and + * queue for free after rcu grace period + * @orig_entry: tt orig entry to be free'd + */ +static void +batadv_tt_orig_list_entry_release(struct batadv_tt_orig_list_entry *orig_entry) +{ + batadv_orig_node_free_ref(orig_entry->orig_node); + kfree_rcu(orig_entry, rcu); +} + static void batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) { if (!atomic_dec_and_test(&orig_entry->refcount)) return; - call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); + batadv_tt_orig_list_entry_release(orig_entry); } /** @@ -1427,15 +1427,21 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } /* if the client was temporary added before receiving the first - * OGM announcing it, we have to clear the TEMP flag + * OGM announcing it, we have to clear the TEMP flag. Also, + * remove the previous temporary orig node and re-add it + * if required. If the orig entry changed, the new one which + * is a non-temporary entry is preferred. */ - common->flags &= ~BATADV_TT_CLIENT_TEMP; + if (common->flags & BATADV_TT_CLIENT_TEMP) { + batadv_tt_global_del_orig_list(tt_global_entry); + common->flags &= ~BATADV_TT_CLIENT_TEMP; + } /* the change can carry possible "attribute" flags like the * TT_CLIENT_WIFI, therefore they have to be copied in the * client entry */ - tt_global_entry->common.flags |= flags; + common->flags |= flags; /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a @@ -2411,8 +2417,8 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; + int i, orig_num_vlan; u32 crc; - int i; /* check if each received CRC matches the locally stored one */ for (i = 0; i < num_vlan; i++) { @@ -2438,6 +2444,18 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, return false; } + /* check if any excess VLANs exist locally for the originator + * which are not mentioned in the TVLV from the originator. + */ + rcu_read_lock(); + orig_num_vlan = 0; + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) + orig_num_vlan++; + rcu_read_unlock(); + + if (orig_num_vlan > num_vlan) + return false; + return true; } @@ -3319,7 +3337,10 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, bool ret = false; vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan || !atomic_read(&vlan->ap_isolation)) + if (!vlan) + return false; + + if (!atomic_read(&vlan->ap_isolation)) goto out; tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid); @@ -3336,8 +3357,7 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, ret = true; out: - if (vlan) - batadv_softif_vlan_free_ref(vlan); + batadv_softif_vlan_free_ref(vlan); if (tt_global_entry) batadv_tt_global_entry_free_ref(tt_global_entry); if (tt_local_entry) diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index d260efd70499..3437b667a2cd 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -100,6 +100,8 @@ struct batadv_hard_iface_bat_iv { * @bat_iv: BATMAN IV specific per hard interface data * @cleanup_work: work queue callback item for hard interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs + * @neigh_list: list of unique single hop neighbors via this interface + * @neigh_list_lock: lock protecting neigh_list */ struct batadv_hard_iface { struct list_head list; @@ -115,6 +117,9 @@ struct batadv_hard_iface { struct batadv_hard_iface_bat_iv bat_iv; struct work_struct cleanup_work; struct dentry *debug_dir; + struct hlist_head neigh_list; + /* neigh_list_lock protects: neigh_list */ + spinlock_t neigh_list_lock; }; /** @@ -218,12 +223,12 @@ struct batadv_orig_bat_iv { * @orig: originator ethernet address * @ifinfo_list: list for routers per outgoing interface * @last_bonding_candidate: pointer to last ifinfo of last used router - * @batadv_dat_addr_t: address of the orig node in the distributed hash + * @dat_addr: address of the orig node in the distributed hash * @last_seen: time when last packet from this node was received * @bcast_seqno_reset: time when the broadcast seqno window was reset * @mcast_handler_lock: synchronizes mcast-capability and -flag changes * @mcast_flags: multicast flags announced by the orig node - * @mcast_want_all_unsnoop_node: a list node for the + * @mcast_want_all_unsnoopables_node: a list node for the * mcast.want_all_unsnoopables list * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 list * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 list @@ -341,6 +346,23 @@ struct batadv_gw_node { }; /** + * batadv_hardif_neigh_node - unique neighbor per hard interface + * @list: list node for batadv_hard_iface::neigh_list + * @addr: the MAC address of the neighboring interface + * @if_incoming: pointer to incoming hard interface + * @refcount: number of contexts the object is used + * @rcu: struct used for freeing in a RCU-safe manner + */ +struct batadv_hardif_neigh_node { + struct hlist_node list; + u8 addr[ETH_ALEN]; + struct batadv_hard_iface *if_incoming; + unsigned long last_seen; + atomic_t refcount; + struct rcu_head rcu; +}; + +/** * struct batadv_neigh_node - structure for single hops neighbors * @list: list node for batadv_orig_node::neigh_list * @orig_node: pointer to corresponding orig_node @@ -349,9 +371,8 @@ struct batadv_gw_node { * @ifinfo_lock: lock protecting private ifinfo members and list * @if_incoming: pointer to incoming hard interface * @last_seen: when last packet via this neighbor was received - * @last_ttl: last received ttl from this neigh node + * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner - * @bat_iv: B.A.T.M.A.N. IV private structure */ struct batadv_neigh_node { struct hlist_node list; @@ -401,13 +422,14 @@ struct batadv_neigh_ifinfo { struct rcu_head rcu; }; +#ifdef CONFIG_BATMAN_ADV_BLA + /** * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression - * @orig[ETH_ALEN]: mac address of orig node orginating the broadcast + * @orig: mac address of orig node orginating the broadcast * @crc: crc32 checksum of broadcast payload * @entrytime: time when the broadcast packet was received */ -#ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bcast_duplist_entry { u8 orig[ETH_ALEN]; __be32 crc; @@ -549,9 +571,11 @@ struct batadv_priv_tt { struct delayed_work work; }; +#ifdef CONFIG_BATMAN_ADV_BLA + /** * struct batadv_priv_bla - per mesh interface bridge loope avoidance data - * @num_requests; number of bla requests in flight + * @num_requests: number of bla requests in flight * @claim_hash: hash table containing mesh nodes this host has claimed * @backbone_hash: hash table containing all detected backbone gateways * @bcast_duplist: recently received broadcast packets array (for broadcast @@ -561,7 +585,6 @@ struct batadv_priv_tt { * @claim_dest: local claim data (e.g. claim group) * @work: work queue callback item for cleanups & bla announcements */ -#ifdef CONFIG_BATMAN_ADV_BLA struct batadv_priv_bla { atomic_t num_requests; struct batadv_hashtable *claim_hash; @@ -575,6 +598,8 @@ struct batadv_priv_bla { }; #endif +#ifdef CONFIG_BATMAN_ADV_DEBUG + /** * struct batadv_priv_debug_log - debug logging data * @log_buff: buffer holding the logs (ring bufer) @@ -583,7 +608,6 @@ struct batadv_priv_bla { * @lock: lock protecting log_buff, log_start & log_end * @queue_wait: log reader's wait queue */ -#ifdef CONFIG_BATMAN_ADV_DEBUG struct batadv_priv_debug_log { char log_buff[BATADV_LOG_BUF_LEN]; unsigned long log_start; @@ -625,13 +649,14 @@ struct batadv_priv_tvlv { spinlock_t handler_list_lock; /* protects handler_list */ }; +#ifdef CONFIG_BATMAN_ADV_DAT + /** * struct batadv_priv_dat - per mesh interface DAT private data * @addr: node DAT address * @hash: hashtable representing the local ARP cache * @work: work queue callback item for cache purging */ -#ifdef CONFIG_BATMAN_ADV_DAT struct batadv_priv_dat { batadv_dat_addr_t addr; struct batadv_hashtable *hash; @@ -773,7 +798,7 @@ struct batadv_softif_vlan { * @dat: distributed arp table data * @mcast: multicast data * @network_coding: bool indicating whether network coding is enabled - * @batadv_priv_nc: network coding data + * @nc: network coding data */ struct batadv_priv { atomic_t mesh_state; @@ -871,6 +896,8 @@ struct batadv_socket_packet { u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; }; +#ifdef CONFIG_BATMAN_ADV_BLA + /** * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN * @orig: originator address of backbone node (mac address of primary iface) @@ -884,10 +911,10 @@ struct batadv_socket_packet { * backbone gateway - no bcast traffic is formwared until the situation was * resolved * @crc: crc16 checksum over all claims + * @crc_lock: lock protecting crc * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ -#ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bla_backbone_gw { u8 orig[ETH_ALEN]; unsigned short vid; @@ -897,6 +924,7 @@ struct batadv_bla_backbone_gw { atomic_t wait_periods; atomic_t request_sent; u16 crc; + spinlock_t crc_lock; /* protects crc */ atomic_t refcount; struct rcu_head rcu; }; @@ -905,7 +933,7 @@ struct batadv_bla_backbone_gw { * struct batadv_bla_claim - claimed non-mesh client structure * @addr: mac address of claimed non-mesh client * @vid: vlan id this client was detected on - * @batadv_bla_backbone_gw: pointer to backbone gw claiming this client + * @backbone_gw: pointer to backbone gw claiming this client * @lasttime: last time we heard of claim (locals only) * @hash_entry: hlist node for batadv_priv_bla::claim_hash * @refcount: number of contexts the object is used @@ -1131,11 +1159,13 @@ struct batadv_forw_packet { * @bat_primary_iface_set: called when primary interface is selected / changed * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue * @bat_ogm_emit: send scheduled OGM + * @bat_hardif_neigh_init: called on creation of single hop entry * @bat_neigh_cmp: compare the metrics of two neighbors for their respective * outgoing interfaces - * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better - * than neigh2 for their respective outgoing interface from the metric + * @bat_neigh_is_similar_or_better: check if neigh1 is equally similar or + * better than neigh2 for their respective outgoing interface from the metric * prospective + * @bat_neigh_print: print the single hop neighbor list (optional) * @bat_neigh_free: free the resources allocated by the routing algorithm for a * neigh_node object * @bat_orig_print: print the originator table (optional) @@ -1156,15 +1186,17 @@ struct batadv_algo_ops { void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface); void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet); /* neigh_node handling API */ + void (*bat_hardif_neigh_init)(struct batadv_hardif_neigh_node *neigh); int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); - bool (*bat_neigh_is_equiv_or_better) + bool (*bat_neigh_is_similar_or_better) (struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); + void (*bat_neigh_print)(struct batadv_priv *priv, struct seq_file *seq); void (*bat_neigh_free)(struct batadv_neigh_node *neigh); /* orig_node handling API */ void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq, @@ -1224,8 +1256,6 @@ struct batadv_dat_candidate { * struct batadv_tvlv_container - container for tvlv appended to OGMs * @list: hlist node for batadv_priv_tvlv::container_list * @tvlv_hdr: tvlv header information needed to construct the tvlv - * @value_len: length of the buffer following this struct which contains - * the actual tvlv payload * @refcount: number of contexts the object is used */ struct batadv_tvlv_container { diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 9e9cca3689a0..d040365ba98e 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -825,9 +825,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); spin_unlock(&devices_lock); - lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE); - - err = register_netdev(netdev); + err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE); if (err < 0) { BT_INFO("register_netdev failed %d", err); spin_lock(&devices_lock); @@ -890,7 +888,7 @@ static void delete_netdev(struct work_struct *work) struct lowpan_dev *entry = container_of(work, struct lowpan_dev, delete_netdev); - unregister_netdev(entry->netdev); + lowpan_unregister_netdev(entry->netdev); /* The entry pointer is deleted by the netdev destructor. */ } @@ -1348,7 +1346,7 @@ static void disconnect_devices(void) ifdown(entry->netdev); BT_DBG("Unregistering netdev %s %p", entry->netdev->name, entry->netdev); - unregister_netdev(entry->netdev); + lowpan_unregister_netdev(entry->netdev); kfree(entry); } } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index a3bffd1ec2b4..955eda93e66f 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -33,8 +33,6 @@ #include "selftest.h" -#define VERSION "2.21" - /* Bluetooth sockets */ #define BT_MAX_PROTO 8 static const struct net_proto_family *bt_proto[BT_MAX_PROTO]; @@ -176,20 +174,20 @@ EXPORT_SYMBOL(bt_accept_unlink); struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { - struct list_head *p, *n; + struct bt_sock *s, *n; struct sock *sk; BT_DBG("parent %p", parent); - list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { - sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); + list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { + sk = (struct sock *)s; lock_sock(sk); /* FIXME: Is this check still needed */ if (sk->sk_state == BT_CLOSED) { - release_sock(sk); bt_accept_unlink(sk); + release_sock(sk); continue; } @@ -271,11 +269,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo) if (signal_pending(current) || !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); @@ -390,11 +388,11 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline unsigned int bt_accept_poll(struct sock *parent) { - struct list_head *p, *n; + struct bt_sock *s, *n; struct sock *sk; - list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { - sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); + list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { + sk = (struct sock *)s; if (sk->sk_state == BT_CONNECTED || (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && sk->sk_state == BT_CONNECT2)) @@ -441,7 +439,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } @@ -671,7 +669,7 @@ static const struct file_operations bt_fops = { }; int bt_procfs_init(struct net *net, const char *name, - struct bt_sock_list* sk_list, + struct bt_sock_list *sk_list, int (* seq_show)(struct seq_file *, void *)) { sk_list->custom_seq_show = seq_show; @@ -687,7 +685,7 @@ void bt_procfs_cleanup(struct net *net, const char *name) } #else int bt_procfs_init(struct net *net, const char *name, - struct bt_sock_list* sk_list, + struct bt_sock_list *sk_list, int (* seq_show)(struct seq_file *, void *)) { return 0; @@ -715,7 +713,7 @@ static int __init bt_init(void) sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); - BT_INFO("Core ver %s", VERSION); + BT_INFO("Core ver %s", BT_SUBSYS_VERSION); err = bt_selftest(); if (err < 0) @@ -789,7 +787,7 @@ subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); -MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); -MODULE_VERSION(VERSION); +MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); +MODULE_VERSION(BT_SUBSYS_VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 1641367e54ca..fbf251fef70f 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -608,8 +608,11 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) s->msg.msg_flags = MSG_NOSIGNAL; #ifdef CONFIG_BT_BNEP_MC_FILTER - /* Set default mc filter */ - set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter); + /* Set default mc filter to not filter out any mc addresses + * as defined in the BNEP specification (revision 0.95a) + * http://grouper.ieee.org/groups/802/15/Bluetooth/BNEP.pdf + */ + s->mc_filter = ~0LL; #endif #ifdef CONFIG_BT_BNEP_PROTO_FILTER diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index 9a50338772f3..46ac686c8911 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -100,10 +100,8 @@ static void cmtp_application_del(struct cmtp_session *session, struct cmtp_appli static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value) { struct cmtp_application *app; - struct list_head *p; - list_for_each(p, &session->applications) { - app = list_entry(p, struct cmtp_application, list); + list_for_each_entry(app, &session->applications, list) { switch (pattern) { case CMTP_MSGNUM: if (app->msgnum == value) @@ -511,14 +509,12 @@ static int cmtp_proc_show(struct seq_file *m, void *v) struct capi_ctr *ctrl = m->private; struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *app; - struct list_head *p; seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl)); seq_printf(m, "addr %s\n", session->name); seq_printf(m, "ctrl %d\n", session->num); - list_for_each(p, &session->applications) { - app = list_entry(p, struct cmtp_application, list); + list_for_each_entry(app, &session->applications, list) { seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); } diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 298ed37010e6..9e59b6654126 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -178,8 +178,7 @@ static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff * cmtp_add_msgpart(session, id, skb->data + hdrlen, len); break; default: - if (session->reassembly[id] != NULL) - kfree_skb(session->reassembly[id]); + kfree_skb(session->reassembly[id]); session->reassembly[id] = NULL; break; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 85b82f7adbd2..32575b49f4a0 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -178,6 +178,10 @@ static void hci_connect_le_scan_remove(struct hci_conn *conn) hci_dev_hold(conn->hdev); hci_conn_get(conn); + /* Even though we hold a reference to the hdev, many other + * things might get cleaned up meanwhile, including the hdev's + * own workqueue, so we can't use that for scheduling. + */ schedule_work(&conn->le_scan_cleanup); } @@ -664,8 +668,16 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) conn->state = BT_CLOSED; - mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, - status); + /* If the status indicates successful cancellation of + * the attempt (i.e. Unkown Connection Id) there's no point of + * notifying failure since we'll go back to keep trying to + * connect. The only exception is explicit connect requests + * where a timeout + cancel does indicate an actual failure. + */ + if (status != HCI_ERROR_UNKNOWN_CONN_ID || + (params && params->explicit_connect)) + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); hci_connect_cfm(conn, status); @@ -679,7 +691,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) /* Re-enable advertising in case this was a failed connection * attempt as a peripheral. */ - mgmt_reenable_advertising(hdev); + hci_req_reenable_advertising(hdev); } static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) @@ -722,8 +734,12 @@ static void hci_req_add_le_create_conn(struct hci_request *req, if (hci_update_random_address(req, false, &own_addr_type)) return; + /* Set window to be the same value as the interval to enable + * continuous scanning. + */ cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); - cp.scan_window = cpu_to_le16(hdev->le_scan_window); + cp.scan_window = cp.scan_interval; + bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; cp.own_address_type = own_addr_type; @@ -781,7 +797,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 role) { struct hci_conn_params *params; - struct hci_conn *conn, *conn_unfinished; + struct hci_conn *conn; struct smp_irk *irk; struct hci_request req; int err; @@ -794,35 +810,22 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EOPNOTSUPP); } - /* Some devices send ATT messages as soon as the physical link is - * established. To be able to handle these ATT messages, the user- - * space first establishes the connection and then starts the pairing - * process. - * - * So if a hci_conn object already exists for the following connection - * attempt, we simply update pending_sec_level and auth_type fields - * and return the object found. - */ - conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); - conn_unfinished = NULL; - if (conn) { - if (conn->state == BT_CONNECT && - test_bit(HCI_CONN_SCANNING, &conn->flags)) { - BT_DBG("will continue unfinished conn %pMR", dst); - conn_unfinished = conn; - } else { - if (conn->pending_sec_level < sec_level) - conn->pending_sec_level = sec_level; - goto done; - } - } - /* Since the controller supports only one LE connection attempt at a * time, we return -EBUSY if there is any connection attempt running. */ if (hci_lookup_le_connect(hdev)) return ERR_PTR(-EBUSY); + /* If there's already a connection object but it's not in + * scanning state it means it must already be established, in + * which case we can't do anything else except report a failure + * to connect. + */ + conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); + if (conn && !test_bit(HCI_CONN_SCANNING, &conn->flags)) { + return ERR_PTR(-EBUSY); + } + /* When given an identity address with existing identity * resolving key, the connection needs to be established * to a resolvable random address. @@ -838,23 +841,20 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, dst_type = ADDR_LE_DEV_RANDOM; } - if (conn_unfinished) { - conn = conn_unfinished; + if (conn) { bacpy(&conn->dst, dst); } else { conn = hci_conn_add(hdev, LE_LINK, dst, role); + if (!conn) + return ERR_PTR(-ENOMEM); + hci_conn_hold(conn); + conn->pending_sec_level = sec_level; } - if (!conn) - return ERR_PTR(-ENOMEM); - conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; - if (!conn_unfinished) - conn->pending_sec_level = sec_level; - hci_req_init(&req, hdev); /* Disable advertising if we're active. For master role @@ -918,37 +918,9 @@ create_conn: return ERR_PTR(err); } -done: - /* If this is continuation of connect started by hci_connect_le_scan, - * it already called hci_conn_hold and calling it again would mess the - * counter. - */ - if (!conn_unfinished) - hci_conn_hold(conn); - return conn; } -static void hci_connect_le_scan_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - struct hci_conn *conn; - - if (!status) - return; - - BT_ERR("Failed to add device to auto conn whitelist: status 0x%2.2x", - status); - - hci_dev_lock(hdev); - - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); - if (conn) - hci_le_conn_failed(conn, status); - - hci_dev_unlock(hdev); -} - static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) { struct hci_conn *conn; @@ -964,10 +936,9 @@ static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) } /* This function requires the caller holds hdev->lock */ -static int hci_explicit_conn_params_set(struct hci_request *req, +static int hci_explicit_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { - struct hci_dev *hdev = req->hdev; struct hci_conn_params *params; if (is_connected(hdev, addr, addr_type)) @@ -995,7 +966,6 @@ static int hci_explicit_conn_params_set(struct hci_request *req, } params->explicit_connect = true; - __hci_update_background_scan(req); BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, params->auto_connect); @@ -1006,11 +976,9 @@ static int hci_explicit_conn_params_set(struct hci_request *req, /* This function requires the caller holds hdev->lock */ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, - u16 conn_timeout, u8 role) + u16 conn_timeout) { struct hci_conn *conn; - struct hci_request req; - int err; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { @@ -1038,29 +1006,22 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, BT_DBG("requesting refresh of dst_addr"); - conn = hci_conn_add(hdev, LE_LINK, dst, role); + conn = hci_conn_add(hdev, LE_LINK, dst, HCI_ROLE_MASTER); if (!conn) return ERR_PTR(-ENOMEM); - hci_req_init(&req, hdev); - - if (hci_explicit_conn_params_set(&req, dst, dst_type) < 0) + if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) return ERR_PTR(-EBUSY); conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); - - err = hci_req_run(&req, hci_connect_le_scan_complete); - if (err && err != -ENODATA) { - hci_conn_del(conn); - return ERR_PTR(err); - } - conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; + hci_update_background_scan(hdev); + done: hci_conn_hold(conn); return conn; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 62edbf1b114e..47bcef754796 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -56,15 +56,6 @@ DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); -/* ----- HCI requests ----- */ - -#define HCI_REQ_DONE 0 -#define HCI_REQ_PEND 1 -#define HCI_REQ_CANCELED 2 - -#define hci_req_lock(d) mutex_lock(&d->req_lock) -#define hci_req_unlock(d) mutex_unlock(&d->req_lock) - /* ---- HCI debugfs entries ---- */ static ssize_t dut_mode_read(struct file *file, char __user *user_buf, @@ -73,7 +64,7 @@ static ssize_t dut_mode_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -101,14 +92,14 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) return -EALREADY; - hci_req_lock(hdev); + hci_req_sync_lock(hdev); if (enable) skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL, HCI_CMD_TIMEOUT); else skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, HCI_CMD_TIMEOUT); - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -133,7 +124,7 @@ static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -165,9 +156,9 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, !test_bit(HCI_RUNNING, &hdev->flags)) goto done; - hci_req_lock(hdev); + hci_req_sync_lock(hdev); err = hdev->set_diag(hdev, enable); - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); if (err < 0) return err; @@ -198,197 +189,14 @@ static void hci_debugfs_create_basic(struct hci_dev *hdev) &vendor_diag_fops); } -/* ---- HCI requests ---- */ - -static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, - struct sk_buff *skb) -{ - BT_DBG("%s result 0x%2.2x", hdev->name, result); - - if (hdev->req_status == HCI_REQ_PEND) { - hdev->req_result = result; - hdev->req_status = HCI_REQ_DONE; - if (skb) - hdev->req_skb = skb_get(skb); - wake_up_interruptible(&hdev->req_wait_q); - } -} - -static void hci_req_cancel(struct hci_dev *hdev, int err) -{ - BT_DBG("%s err 0x%2.2x", hdev->name, err); - - if (hdev->req_status == HCI_REQ_PEND) { - hdev->req_result = err; - hdev->req_status = HCI_REQ_CANCELED; - wake_up_interruptible(&hdev->req_wait_q); - } -} - -struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, - const void *param, u8 event, u32 timeout) -{ - DECLARE_WAITQUEUE(wait, current); - struct hci_request req; - struct sk_buff *skb; - int err = 0; - - BT_DBG("%s", hdev->name); - - hci_req_init(&req, hdev); - - hci_req_add_ev(&req, opcode, plen, param, event); - - hdev->req_status = HCI_REQ_PEND; - - add_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - err = hci_req_run_skb(&req, hci_req_sync_complete); - if (err < 0) { - remove_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_RUNNING); - return ERR_PTR(err); - } - - schedule_timeout(timeout); - - remove_wait_queue(&hdev->req_wait_q, &wait); - - if (signal_pending(current)) - return ERR_PTR(-EINTR); - - switch (hdev->req_status) { - case HCI_REQ_DONE: - err = -bt_to_errno(hdev->req_result); - break; - - case HCI_REQ_CANCELED: - err = -hdev->req_result; - break; - - default: - err = -ETIMEDOUT; - break; - } - - hdev->req_status = hdev->req_result = 0; - skb = hdev->req_skb; - hdev->req_skb = NULL; - - BT_DBG("%s end: err %d", hdev->name, err); - - if (err < 0) { - kfree_skb(skb); - return ERR_PTR(err); - } - - if (!skb) - return ERR_PTR(-ENODATA); - - return skb; -} -EXPORT_SYMBOL(__hci_cmd_sync_ev); - -struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, - const void *param, u32 timeout) -{ - return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout); -} -EXPORT_SYMBOL(__hci_cmd_sync); - -/* Execute request and wait for completion. */ -static int __hci_req_sync(struct hci_dev *hdev, - void (*func)(struct hci_request *req, - unsigned long opt), - unsigned long opt, __u32 timeout) -{ - struct hci_request req; - DECLARE_WAITQUEUE(wait, current); - int err = 0; - - BT_DBG("%s start", hdev->name); - - hci_req_init(&req, hdev); - - hdev->req_status = HCI_REQ_PEND; - - func(&req, opt); - - add_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_INTERRUPTIBLE); - - err = hci_req_run_skb(&req, hci_req_sync_complete); - if (err < 0) { - hdev->req_status = 0; - - remove_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_RUNNING); - - /* ENODATA means the HCI request command queue is empty. - * This can happen when a request with conditionals doesn't - * trigger any commands to be sent. This is normal behavior - * and should not trigger an error return. - */ - if (err == -ENODATA) - return 0; - - return err; - } - - schedule_timeout(timeout); - - remove_wait_queue(&hdev->req_wait_q, &wait); - - if (signal_pending(current)) - return -EINTR; - - switch (hdev->req_status) { - case HCI_REQ_DONE: - err = -bt_to_errno(hdev->req_result); - break; - - case HCI_REQ_CANCELED: - err = -hdev->req_result; - break; - - default: - err = -ETIMEDOUT; - break; - } - - hdev->req_status = hdev->req_result = 0; - - BT_DBG("%s end: err %d", hdev->name, err); - - return err; -} - -static int hci_req_sync(struct hci_dev *hdev, - void (*req)(struct hci_request *req, - unsigned long opt), - unsigned long opt, __u32 timeout) -{ - int ret; - - if (!test_bit(HCI_UP, &hdev->flags)) - return -ENETDOWN; - - /* Serialize all requests */ - hci_req_lock(hdev); - ret = __hci_req_sync(hdev, req, opt, timeout); - hci_req_unlock(hdev); - - return ret; -} - -static void hci_reset_req(struct hci_request *req, unsigned long opt) +static int hci_reset_req(struct hci_request *req, unsigned long opt) { BT_DBG("%s %ld", req->hdev->name, opt); /* Reset device */ set_bit(HCI_RESET, &req->hdev->flags); hci_req_add(req, HCI_OP_RESET, 0, NULL); + return 0; } static void bredr_init(struct hci_request *req) @@ -428,7 +236,7 @@ static void amp_init1(struct hci_request *req) hci_req_add(req, HCI_OP_READ_LOCATION_DATA, 0, NULL); } -static void amp_init2(struct hci_request *req) +static int amp_init2(struct hci_request *req) { /* Read Local Supported Features. Not all AMP controllers * support this so it's placed conditionally in the second @@ -436,9 +244,11 @@ static void amp_init2(struct hci_request *req) */ if (req->hdev->commands[14] & 0x20) hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL); + + return 0; } -static void hci_init1_req(struct hci_request *req, unsigned long opt) +static int hci_init1_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; @@ -461,6 +271,8 @@ static void hci_init1_req(struct hci_request *req, unsigned long opt) BT_ERR("Unknown device type %d", hdev->dev_type); break; } + + return 0; } static void bredr_setup(struct hci_request *req) @@ -531,20 +343,30 @@ static void hci_setup_event_mask(struct hci_request *req) if (lmp_bredr_capable(hdev)) { events[4] |= 0x01; /* Flow Specification Complete */ - events[4] |= 0x02; /* Inquiry Result with RSSI */ - events[4] |= 0x04; /* Read Remote Extended Features Complete */ - events[5] |= 0x08; /* Synchronous Connection Complete */ - events[5] |= 0x10; /* Synchronous Connection Changed */ } else { /* Use a different default for LE-only devices */ memset(events, 0, sizeof(events)); - events[0] |= 0x10; /* Disconnection Complete */ - events[1] |= 0x08; /* Read Remote Version Information Complete */ events[1] |= 0x20; /* Command Complete */ events[1] |= 0x40; /* Command Status */ events[1] |= 0x80; /* Hardware Error */ - events[2] |= 0x04; /* Number of Completed Packets */ - events[3] |= 0x02; /* Data Buffer Overflow */ + + /* If the controller supports the Disconnect command, enable + * the corresponding event. In addition enable packet flow + * control related events. + */ + if (hdev->commands[0] & 0x20) { + events[0] |= 0x10; /* Disconnection Complete */ + events[2] |= 0x04; /* Number of Completed Packets */ + events[3] |= 0x02; /* Data Buffer Overflow */ + } + + /* If the controller supports the Read Remote Version + * Information command, enable the corresponding event. + */ + if (hdev->commands[2] & 0x80) + events[1] |= 0x08; /* Read Remote Version Information + * Complete + */ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) { events[0] |= 0x80; /* Encryption Change */ @@ -552,9 +374,18 @@ static void hci_setup_event_mask(struct hci_request *req) } } - if (lmp_inq_rssi_capable(hdev)) + if (lmp_inq_rssi_capable(hdev) || + test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) events[4] |= 0x02; /* Inquiry Result with RSSI */ + if (lmp_ext_feat_capable(hdev)) + events[4] |= 0x04; /* Read Remote Extended Features Complete */ + + if (lmp_esco_capable(hdev)) { + events[5] |= 0x08; /* Synchronous Connection Complete */ + events[5] |= 0x10; /* Synchronous Connection Changed */ + } + if (lmp_sniffsubr_capable(hdev)) events[5] |= 0x20; /* Sniff Subrating */ @@ -590,7 +421,7 @@ static void hci_setup_event_mask(struct hci_request *req) hci_req_add(req, HCI_OP_SET_EVENT_MASK, sizeof(events), events); } -static void hci_init2_req(struct hci_request *req, unsigned long opt) +static int hci_init2_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; @@ -670,6 +501,8 @@ static void hci_init2_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable), &enable); } + + return 0; } static void hci_setup_link_policy(struct hci_request *req) @@ -744,7 +577,7 @@ static void hci_set_event_mask_page_2(struct hci_request *req) hci_req_add(req, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events); } -static void hci_init3_req(struct hci_request *req, unsigned long opt) +static int hci_init3_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; u8 p; @@ -777,7 +610,6 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) u8 events[8]; memset(events, 0, sizeof(events)); - events[0] = 0x0f; if (hdev->le_features[0] & HCI_LE_ENCRYPTION) events[0] |= 0x10; /* LE Long Term Key Request */ @@ -804,6 +636,34 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ + /* If the controller supports the LE Set Scan Enable command, + * enable the corresponding advertising report event. + */ + if (hdev->commands[26] & 0x08) + events[0] |= 0x02; /* LE Advertising Report */ + + /* If the controller supports the LE Create Connection + * command, enable the corresponding event. + */ + if (hdev->commands[26] & 0x10) + events[0] |= 0x01; /* LE Connection Complete */ + + /* If the controller supports the LE Connection Update + * command, enable the corresponding event. + */ + if (hdev->commands[27] & 0x04) + events[0] |= 0x04; /* LE Connection Update + * Complete + */ + + /* If the controller supports the LE Read Remote Used Features + * command, enable the corresponding event. + */ + if (hdev->commands[27] & 0x20) + events[0] |= 0x08; /* LE Read Remote Used + * Features Complete + */ + /* If the controller supports the LE Read Local P-256 * Public Key command, enable the corresponding event. */ @@ -856,9 +716,11 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), &cp); } + + return 0; } -static void hci_init4_req(struct hci_request *req, unsigned long opt) +static int hci_init4_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; @@ -909,20 +771,22 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT, sizeof(support), &support); } + + return 0; } static int __hci_init(struct hci_dev *hdev) { int err; - err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT); + err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT, NULL); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); - err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT); + err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT, NULL); if (err < 0) return err; @@ -933,11 +797,11 @@ static int __hci_init(struct hci_dev *hdev) if (hdev->dev_type != HCI_BREDR) return 0; - err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT); + err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL); if (err < 0) return err; - err = __hci_req_sync(hdev, hci_init4_req, 0, HCI_INIT_TIMEOUT); + err = __hci_req_sync(hdev, hci_init4_req, 0, HCI_INIT_TIMEOUT, NULL); if (err < 0) return err; @@ -968,7 +832,7 @@ static int __hci_init(struct hci_dev *hdev) return 0; } -static void hci_init0_req(struct hci_request *req, unsigned long opt) +static int hci_init0_req(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; @@ -984,6 +848,8 @@ static void hci_init0_req(struct hci_request *req, unsigned long opt) /* Read BD Address */ if (hdev->set_bdaddr) hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL); + + return 0; } static int __hci_unconf_init(struct hci_dev *hdev) @@ -993,7 +859,7 @@ static int __hci_unconf_init(struct hci_dev *hdev) if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return 0; - err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT); + err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT, NULL); if (err < 0) return err; @@ -1003,7 +869,7 @@ static int __hci_unconf_init(struct hci_dev *hdev) return 0; } -static void hci_scan_req(struct hci_request *req, unsigned long opt) +static int hci_scan_req(struct hci_request *req, unsigned long opt) { __u8 scan = opt; @@ -1011,9 +877,10 @@ static void hci_scan_req(struct hci_request *req, unsigned long opt) /* Inquiry and Page scans */ hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + return 0; } -static void hci_auth_req(struct hci_request *req, unsigned long opt) +static int hci_auth_req(struct hci_request *req, unsigned long opt) { __u8 auth = opt; @@ -1021,9 +888,10 @@ static void hci_auth_req(struct hci_request *req, unsigned long opt) /* Authentication */ hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); + return 0; } -static void hci_encrypt_req(struct hci_request *req, unsigned long opt) +static int hci_encrypt_req(struct hci_request *req, unsigned long opt) { __u8 encrypt = opt; @@ -1031,9 +899,10 @@ static void hci_encrypt_req(struct hci_request *req, unsigned long opt) /* Encryption */ hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); + return 0; } -static void hci_linkpol_req(struct hci_request *req, unsigned long opt) +static int hci_linkpol_req(struct hci_request *req, unsigned long opt) { __le16 policy = cpu_to_le16(opt); @@ -1041,6 +910,7 @@ static void hci_linkpol_req(struct hci_request *req, unsigned long opt) /* Default link policy */ hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); + return 0; } /* Get HCI device by index. @@ -1285,7 +1155,7 @@ static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) return copied; } -static void hci_inq_req(struct hci_request *req, unsigned long opt) +static int hci_inq_req(struct hci_request *req, unsigned long opt) { struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt; struct hci_dev *hdev = req->hdev; @@ -1294,13 +1164,15 @@ static void hci_inq_req(struct hci_request *req, unsigned long opt) BT_DBG("%s", hdev->name); if (test_bit(HCI_INQUIRY, &hdev->flags)) - return; + return 0; /* Start Inquiry */ memcpy(&cp.lap, &ir->lap, 3); cp.length = ir->length; cp.num_rsp = ir->num_rsp; hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); + + return 0; } int hci_inquiry(void __user *arg) @@ -1351,7 +1223,7 @@ int hci_inquiry(void __user *arg) if (do_inquiry) { err = hci_req_sync(hdev, hci_inq_req, (unsigned long) &ir, - timeo); + timeo, NULL); if (err < 0) goto done; @@ -1404,7 +1276,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) BT_DBG("%s %p", hdev->name, hdev); - hci_req_lock(hdev); + hci_req_sync_lock(hdev); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { ret = -ENODEV; @@ -1527,10 +1399,10 @@ static int hci_dev_do_open(struct hci_dev *hdev) !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + hci_dev_test_flag(hdev, HCI_MGMT) && hdev->dev_type == HCI_BREDR) { - hci_dev_lock(hdev); - mgmt_powered(hdev, 1); - hci_dev_unlock(hdev); + ret = __hci_req_hci_power_on(hdev); + mgmt_power_on(hdev, ret); } } else { /* Init failed, cleanup */ @@ -1557,7 +1429,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) } done: - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); return ret; } @@ -1651,12 +1523,12 @@ int hci_dev_do_close(struct hci_dev *hdev) cancel_delayed_work(&hdev->power_off); - hci_req_cancel(hdev, ENODEV); - hci_req_lock(hdev); + hci_request_cancel_all(hdev); + hci_req_sync_lock(hdev); if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); return 0; } @@ -1665,7 +1537,6 @@ int hci_dev_do_close(struct hci_dev *hdev) flush_work(&hdev->rx_work); if (hdev->discov_timeout > 0) { - cancel_delayed_work(&hdev->discov_off); hdev->discov_timeout = 0; hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); @@ -1674,17 +1545,9 @@ int hci_dev_do_close(struct hci_dev *hdev) if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); - cancel_delayed_work_sync(&hdev->le_scan_disable); - cancel_delayed_work_sync(&hdev->le_scan_restart); - if (hci_dev_test_flag(hdev, HCI_MGMT)) cancel_delayed_work_sync(&hdev->rpa_expired); - if (hdev->adv_instance_timeout) { - cancel_delayed_work_sync(&hdev->adv_instance_expire); - hdev->adv_instance_timeout = 0; - } - /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ @@ -1696,8 +1559,9 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); - if (!auto_off && hdev->dev_type == HCI_BREDR) - mgmt_powered(hdev, 0); + if (!auto_off && hdev->dev_type == HCI_BREDR && + hci_dev_test_flag(hdev, HCI_MGMT)) + __mgmt_power_off(hdev); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); @@ -1717,7 +1581,7 @@ int hci_dev_do_close(struct hci_dev *hdev) if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); - __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT); + __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT, NULL); clear_bit(HCI_INIT, &hdev->flags); } @@ -1754,7 +1618,7 @@ int hci_dev_do_close(struct hci_dev *hdev) memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); hci_dev_put(hdev); return 0; @@ -1790,7 +1654,7 @@ static int hci_dev_do_reset(struct hci_dev *hdev) BT_DBG("%s %p", hdev->name, hdev); - hci_req_lock(hdev); + hci_req_sync_lock(hdev); /* Drop queues */ skb_queue_purge(&hdev->rx_q); @@ -1812,9 +1676,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; - ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); + ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT, NULL); - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); return ret; } @@ -1905,7 +1769,7 @@ static void hci_update_scan_state(struct hci_dev *hdev, u8 scan) hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - mgmt_update_adv_data(hdev); + hci_req_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } @@ -1947,7 +1811,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) switch (cmd) { case HCISETAUTH: err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + HCI_INIT_TIMEOUT, NULL); break; case HCISETENCRYPT: @@ -1959,18 +1823,18 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + HCI_INIT_TIMEOUT, NULL); if (err) break; } err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + HCI_INIT_TIMEOUT, NULL); break; case HCISETSCAN: err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + HCI_INIT_TIMEOUT, NULL); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. @@ -1981,7 +1845,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) case HCISETLINKPOL: err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt, - HCI_INIT_TIMEOUT); + HCI_INIT_TIMEOUT, NULL); break; case HCISETLINKMODE: @@ -2150,6 +2014,16 @@ static void hci_power_on(struct work_struct *work) BT_DBG("%s", hdev->name); + if (test_bit(HCI_UP, &hdev->flags) && + hci_dev_test_flag(hdev, HCI_MGMT) && + hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { + hci_req_sync_lock(hdev); + err = __hci_req_hci_power_on(hdev); + hci_req_sync_unlock(hdev); + mgmt_power_on(hdev, err); + return; + } + err = hci_dev_do_open(hdev); if (err < 0) { hci_dev_lock(hdev); @@ -2232,28 +2106,6 @@ static void hci_error_reset(struct work_struct *work) hci_dev_do_open(hdev); } -static void hci_discov_off(struct work_struct *work) -{ - struct hci_dev *hdev; - - hdev = container_of(work, struct hci_dev, discov_off.work); - - BT_DBG("%s", hdev->name); - - mgmt_discoverable_timeout(hdev); -} - -static void hci_adv_timeout_expire(struct work_struct *work) -{ - struct hci_dev *hdev; - - hdev = container_of(work, struct hci_dev, adv_instance_expire.work); - - BT_DBG("%s", hdev->name); - - mgmt_adv_timeout_expired(hdev); -} - void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; @@ -2731,7 +2583,8 @@ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) } /* This function requires the caller holds hdev->lock */ -struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { +struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) +{ struct adv_info *cur_instance; cur_instance = hci_find_adv_instance(hdev, instance); @@ -2757,9 +2610,12 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) BT_DBG("%s removing %dMR", hdev->name, instance); - if (hdev->cur_adv_instance == instance && hdev->adv_instance_timeout) { - cancel_delayed_work(&hdev->adv_instance_expire); - hdev->adv_instance_timeout = 0; + if (hdev->cur_adv_instance == instance) { + if (hdev->adv_instance_timeout) { + cancel_delayed_work(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } + hdev->cur_adv_instance = 0x00; } list_del(&adv_instance->list); @@ -2786,6 +2642,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev) } hdev->adv_instance_cnt = 0; + hdev->cur_adv_instance = 0x00; } /* This function requires the caller holds hdev->lock */ @@ -2856,12 +2713,10 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { - struct list_head *p, *n; + struct bdaddr_list *b, *n; - list_for_each_safe(p, n, bdaddr_list) { - struct bdaddr_list *b = list_entry(p, struct bdaddr_list, list); - - list_del(p); + list_for_each_entry_safe(b, n, bdaddr_list, list) { + list_del(&b->list); kfree(b); } } @@ -3024,181 +2879,16 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) } /* This function requires the caller holds hdev->lock */ -void hci_conn_params_clear_all(struct hci_dev *hdev) +static void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) hci_conn_params_free(params); - hci_update_background_scan(hdev); - BT_DBG("All LE connection parameters were removed"); } -static void inquiry_complete(struct hci_dev *hdev, u8 status, u16 opcode) -{ - if (status) { - BT_ERR("Failed to start inquiry: status %d", status); - - hci_dev_lock(hdev); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - hci_dev_unlock(hdev); - return; - } -} - -static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - /* General inquiry access code (GIAC) */ - u8 lap[3] = { 0x33, 0x8b, 0x9e }; - struct hci_cp_inquiry cp; - int err; - - if (status) { - BT_ERR("Failed to disable LE scanning: status %d", status); - return; - } - - hdev->discovery.scan_start = 0; - - switch (hdev->discovery.type) { - case DISCOV_TYPE_LE: - hci_dev_lock(hdev); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - hci_dev_unlock(hdev); - break; - - case DISCOV_TYPE_INTERLEAVED: - hci_dev_lock(hdev); - - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, - &hdev->quirks)) { - /* If we were running LE only scan, change discovery - * state. If we were running both LE and BR/EDR inquiry - * simultaneously, and BR/EDR inquiry is already - * finished, stop discovery, otherwise BR/EDR inquiry - * will stop discovery when finished. If we will resolve - * remote device name, do not change discovery state. - */ - if (!test_bit(HCI_INQUIRY, &hdev->flags) && - hdev->discovery.state != DISCOVERY_RESOLVING) - hci_discovery_set_state(hdev, - DISCOVERY_STOPPED); - } else { - struct hci_request req; - - hci_inquiry_cache_flush(hdev); - - hci_req_init(&req, hdev); - - memset(&cp, 0, sizeof(cp)); - memcpy(&cp.lap, lap, sizeof(cp.lap)); - cp.length = DISCOV_INTERLEAVED_INQUIRY_LEN; - hci_req_add(&req, HCI_OP_INQUIRY, sizeof(cp), &cp); - - err = hci_req_run(&req, inquiry_complete); - if (err) { - BT_ERR("Inquiry request failed: err %d", err); - hci_discovery_set_state(hdev, - DISCOVERY_STOPPED); - } - } - - hci_dev_unlock(hdev); - break; - } -} - -static void le_scan_disable_work(struct work_struct *work) -{ - struct hci_dev *hdev = container_of(work, struct hci_dev, - le_scan_disable.work); - struct hci_request req; - int err; - - BT_DBG("%s", hdev->name); - - cancel_delayed_work_sync(&hdev->le_scan_restart); - - hci_req_init(&req, hdev); - - hci_req_add_le_scan_disable(&req); - - err = hci_req_run(&req, le_scan_disable_work_complete); - if (err) - BT_ERR("Disable LE scanning request failed: err %d", err); -} - -static void le_scan_restart_work_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - unsigned long timeout, duration, scan_start, now; - - BT_DBG("%s", hdev->name); - - if (status) { - BT_ERR("Failed to restart LE scan: status %d", status); - return; - } - - if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || - !hdev->discovery.scan_start) - return; - - /* When the scan was started, hdev->le_scan_disable has been queued - * after duration from scan_start. During scan restart this job - * has been canceled, and we need to queue it again after proper - * timeout, to make sure that scan does not run indefinitely. - */ - duration = hdev->discovery.scan_duration; - scan_start = hdev->discovery.scan_start; - now = jiffies; - if (now - scan_start <= duration) { - int elapsed; - - if (now >= scan_start) - elapsed = now - scan_start; - else - elapsed = ULONG_MAX - scan_start + now; - - timeout = duration - elapsed; - } else { - timeout = 0; - } - queue_delayed_work(hdev->workqueue, - &hdev->le_scan_disable, timeout); -} - -static void le_scan_restart_work(struct work_struct *work) -{ - struct hci_dev *hdev = container_of(work, struct hci_dev, - le_scan_restart.work); - struct hci_request req; - struct hci_cp_le_set_scan_enable cp; - int err; - - BT_DBG("%s", hdev->name); - - /* If controller is not scanning we are done. */ - if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) - return; - - hci_req_init(&req, hdev); - - hci_req_add_le_scan_disable(&req); - - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_ENABLE; - cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(&req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); - - err = hci_req_run(&req, le_scan_restart_work_complete); - if (err) - BT_ERR("Restart LE scan request failed: err %d", err); -} - /* Copy the Identity Address of the controller. * * If the controller has a public BD_ADDR, then by default use that one. @@ -3297,10 +2987,6 @@ struct hci_dev *hci_alloc_dev(void) INIT_WORK(&hdev->error_reset, hci_error_reset); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); - INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off); - INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); - INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); - INIT_DELAYED_WORK(&hdev->adv_instance_expire, hci_adv_timeout_expire); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); @@ -3310,6 +2996,8 @@ struct hci_dev *hci_alloc_dev(void) INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); + hci_request_setup(hdev); + hci_init_sysfs(hdev); discovery_init(hdev); @@ -3520,7 +3208,7 @@ int hci_reset_dev(struct hci_dev *hdev) if (!skb) return -ENOMEM; - bt_cb(skb)->pkt_type = HCI_EVENT_PKT; + hci_skb_pkt_type(skb) = HCI_EVENT_PKT; memcpy(skb_put(skb, 3), hw_err, 3); /* Send Hardware Error to upper stack */ @@ -3537,9 +3225,9 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) return -ENXIO; } - if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT && - bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && - bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && + hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { kfree_skb(skb); return -EINVAL; } @@ -3561,7 +3249,7 @@ EXPORT_SYMBOL(hci_recv_frame); int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) { /* Mark as diagnostic packet */ - bt_cb(skb)->pkt_type = HCI_DIAG_PKT; + hci_skb_pkt_type(skb) = HCI_DIAG_PKT; /* Time stamp */ __net_timestamp(skb); @@ -3603,7 +3291,8 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { int err; - BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); + BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb), + skb->len); /* Time stamp */ __net_timestamp(skb); @@ -3648,7 +3337,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, /* Stand-alone HCI commands must be flagged as * single-command requests. */ - bt_cb(skb)->hci.req_start = true; + bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -3685,9 +3374,9 @@ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); - hci_req_lock(hdev); + hci_req_sync_lock(hdev); skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); - hci_req_unlock(hdev); + hci_req_sync_unlock(hdev); return skb; } @@ -3716,7 +3405,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, skb->len = skb_headlen(skb); skb->data_len = 0; - bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; + hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; switch (hdev->dev_type) { case HCI_BREDR: @@ -3756,7 +3445,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, do { skb = list; list = list->next; - bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; + hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); @@ -3794,7 +3483,7 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) skb_reset_transport_header(skb); memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); - bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; + hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); @@ -4345,7 +4034,7 @@ static bool hci_req_is_complete(struct hci_dev *hdev) if (!skb) return true; - return bt_cb(skb)->hci.req_start; + return (bt_cb(skb)->hci.req_flags & HCI_REQ_START); } static void hci_resend_last(struct hci_dev *hdev) @@ -4405,20 +4094,20 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, * callback would be found in hdev->sent_cmd instead of the * command queue (hdev->cmd_q). */ - if (bt_cb(hdev->sent_cmd)->hci.req_complete) { - *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete; + if (bt_cb(hdev->sent_cmd)->hci.req_flags & HCI_REQ_SKB) { + *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb; return; } - if (bt_cb(hdev->sent_cmd)->hci.req_complete_skb) { - *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb; + if (bt_cb(hdev->sent_cmd)->hci.req_complete) { + *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { - if (bt_cb(skb)->hci.req_start) { + if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) { __skb_queue_head(&hdev->cmd_q, skb); break; } @@ -4453,7 +4142,7 @@ static void hci_rx_work(struct work_struct *work) if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ - switch (bt_cb(skb)->pkt_type) { + switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: kfree_skb(skb); @@ -4462,7 +4151,7 @@ static void hci_rx_work(struct work_struct *work) } /* Process frame */ - switch (bt_cb(skb)->pkt_type) { + switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: BT_DBG("%s Event packet", hdev->name); hci_event_packet(hdev, skb); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d57c11c1c6b5..c162af5d16bf 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1183,7 +1183,7 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, hci_discovery_set_state(hdev, DISCOVERY_STOPPED); else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) && hdev->discovery.state == DISCOVERY_FINDING) - mgmt_reenable_advertising(hdev); + hci_req_reenable_advertising(hdev); break; @@ -2176,7 +2176,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES, sizeof(cp), &cp); - hci_update_page_scan(hdev); + hci_req_update_scan(hdev); } /* Set packet type for incoming connection */ @@ -2362,7 +2362,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) hci_remove_link_key(hdev, &conn->dst); - hci_update_page_scan(hdev); + hci_req_update_scan(hdev); } params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); @@ -2401,7 +2401,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * is timed out due to Directed Advertising." */ if (type == LE_LINK) - mgmt_reenable_advertising(hdev); + hci_req_reenable_advertising(hdev); unlock: hci_dev_unlock(hdev); @@ -3833,9 +3833,9 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, data.ssp_mode = 0x01; if (hci_dev_test_flag(hdev, HCI_MGMT)) - name_known = eir_has_data_type(info->data, - sizeof(info->data), - EIR_NAME_COMPLETE); + name_known = eir_get_data(info->data, + sizeof(info->data), + EIR_NAME_COMPLETE, NULL); else name_known = true; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 981f8a202c27..41b5f3813f02 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -21,12 +21,19 @@ SOFTWARE IS DISCLAIMED. */ +#include <asm/unaligned.h> + #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> #include "smp.h" #include "hci_request.h" +#define HCI_REQ_DONE 0 +#define HCI_REQ_PEND 1 +#define HCI_REQ_CANCELED 2 + void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); @@ -56,8 +63,12 @@ static int req_run(struct hci_request *req, hci_req_complete_t complete, return -ENODATA; skb = skb_peek_tail(&req->cmd_q); - bt_cb(skb)->hci.req_complete = complete; - bt_cb(skb)->hci.req_complete_skb = complete_skb; + if (complete) { + bt_cb(skb)->hci.req_complete = complete; + } else if (complete_skb) { + bt_cb(skb)->hci.req_complete_skb = complete_skb; + bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB; + } spin_lock_irqsave(&hdev->cmd_q.lock, flags); skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); @@ -78,6 +89,203 @@ int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete) return req_run(req, NULL, complete); } +static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, + struct sk_buff *skb) +{ + BT_DBG("%s result 0x%2.2x", hdev->name, result); + + if (hdev->req_status == HCI_REQ_PEND) { + hdev->req_result = result; + hdev->req_status = HCI_REQ_DONE; + if (skb) + hdev->req_skb = skb_get(skb); + wake_up_interruptible(&hdev->req_wait_q); + } +} + +void hci_req_sync_cancel(struct hci_dev *hdev, int err) +{ + BT_DBG("%s err 0x%2.2x", hdev->name, err); + + if (hdev->req_status == HCI_REQ_PEND) { + hdev->req_result = err; + hdev->req_status = HCI_REQ_CANCELED; + wake_up_interruptible(&hdev->req_wait_q); + } +} + +struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u8 event, u32 timeout) +{ + DECLARE_WAITQUEUE(wait, current); + struct hci_request req; + struct sk_buff *skb; + int err = 0; + + BT_DBG("%s", hdev->name); + + hci_req_init(&req, hdev); + + hci_req_add_ev(&req, opcode, plen, param, event); + + hdev->req_status = HCI_REQ_PEND; + + add_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + err = hci_req_run_skb(&req, hci_req_sync_complete); + if (err < 0) { + remove_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_RUNNING); + return ERR_PTR(err); + } + + schedule_timeout(timeout); + + remove_wait_queue(&hdev->req_wait_q, &wait); + + if (signal_pending(current)) + return ERR_PTR(-EINTR); + + switch (hdev->req_status) { + case HCI_REQ_DONE: + err = -bt_to_errno(hdev->req_result); + break; + + case HCI_REQ_CANCELED: + err = -hdev->req_result; + break; + + default: + err = -ETIMEDOUT; + break; + } + + hdev->req_status = hdev->req_result = 0; + skb = hdev->req_skb; + hdev->req_skb = NULL; + + BT_DBG("%s end: err %d", hdev->name, err); + + if (err < 0) { + kfree_skb(skb); + return ERR_PTR(err); + } + + if (!skb) + return ERR_PTR(-ENODATA); + + return skb; +} +EXPORT_SYMBOL(__hci_cmd_sync_ev); + +struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param, u32 timeout) +{ + return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout); +} +EXPORT_SYMBOL(__hci_cmd_sync); + +/* Execute request and wait for completion. */ +int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, + unsigned long opt), + unsigned long opt, u32 timeout, u8 *hci_status) +{ + struct hci_request req; + DECLARE_WAITQUEUE(wait, current); + int err = 0; + + BT_DBG("%s start", hdev->name); + + hci_req_init(&req, hdev); + + hdev->req_status = HCI_REQ_PEND; + + err = func(&req, opt); + if (err) { + if (hci_status) + *hci_status = HCI_ERROR_UNSPECIFIED; + return err; + } + + add_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + err = hci_req_run_skb(&req, hci_req_sync_complete); + if (err < 0) { + hdev->req_status = 0; + + remove_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_RUNNING); + + /* ENODATA means the HCI request command queue is empty. + * This can happen when a request with conditionals doesn't + * trigger any commands to be sent. This is normal behavior + * and should not trigger an error return. + */ + if (err == -ENODATA) { + if (hci_status) + *hci_status = 0; + return 0; + } + + if (hci_status) + *hci_status = HCI_ERROR_UNSPECIFIED; + + return err; + } + + schedule_timeout(timeout); + + remove_wait_queue(&hdev->req_wait_q, &wait); + + if (signal_pending(current)) + return -EINTR; + + switch (hdev->req_status) { + case HCI_REQ_DONE: + err = -bt_to_errno(hdev->req_result); + if (hci_status) + *hci_status = hdev->req_result; + break; + + case HCI_REQ_CANCELED: + err = -hdev->req_result; + if (hci_status) + *hci_status = HCI_ERROR_UNSPECIFIED; + break; + + default: + err = -ETIMEDOUT; + if (hci_status) + *hci_status = HCI_ERROR_UNSPECIFIED; + break; + } + + hdev->req_status = hdev->req_result = 0; + + BT_DBG("%s end: err %d", hdev->name, err); + + return err; +} + +int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req, + unsigned long opt), + unsigned long opt, u32 timeout, u8 *hci_status) +{ + int ret; + + if (!test_bit(HCI_UP, &hdev->flags)) + return -ENETDOWN; + + /* Serialize all requests */ + hci_req_sync_lock(hdev); + ret = __hci_req_sync(hdev, req, opt, timeout, hci_status); + hci_req_sync_unlock(hdev); + + return ret; +} + struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { @@ -98,8 +306,8 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, BT_DBG("skb len %d", skb->len); - bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; - bt_cb(skb)->hci.opcode = opcode; + hci_skb_pkt_type(skb) = HCI_COMMAND_PKT; + hci_skb_opcode(skb) = opcode; return skb; } @@ -128,7 +336,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, } if (skb_queue_empty(&req->cmd_q)) - bt_cb(skb)->hci.req_start = true; + bt_cb(skb)->hci.req_flags |= HCI_REQ_START; bt_cb(skb)->hci.req_event = event; @@ -141,6 +349,311 @@ void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, hci_req_add_ev(req, opcode, plen, param, 0); } +void __hci_req_write_fast_connectable(struct hci_request *req, bool enable) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_write_page_scan_activity acp; + u8 type; + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + return; + + if (hdev->hci_ver < BLUETOOTH_VER_1_2) + return; + + if (enable) { + type = PAGE_SCAN_TYPE_INTERLACED; + + /* 160 msec page scan interval */ + acp.interval = cpu_to_le16(0x0100); + } else { + type = PAGE_SCAN_TYPE_STANDARD; /* default */ + + /* default 1.28 sec page scan */ + acp.interval = cpu_to_le16(0x0800); + } + + acp.window = cpu_to_le16(0x0012); + + if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || + __cpu_to_le16(hdev->page_scan_window) != acp.window) + hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, + sizeof(acp), &acp); + + if (hdev->page_scan_type != type) + hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); +} + +/* This function controls the background scanning based on hdev->pend_le_conns + * list. If there are pending LE connection we start the background scanning, + * otherwise we stop it. + * + * This function requires the caller holds hdev->lock. + */ +static void __hci_update_background_scan(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + + if (!test_bit(HCI_UP, &hdev->flags) || + test_bit(HCI_INIT, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG) || + hci_dev_test_flag(hdev, HCI_AUTO_OFF) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return; + + /* No point in doing scanning if LE support hasn't been enabled */ + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return; + + /* If discovery is active don't interfere with it */ + if (hdev->discovery.state != DISCOVERY_STOPPED) + return; + + /* Reset RSSI and UUID filters when starting background scanning + * since these filters are meant for service discovery only. + * + * The Start Discovery and Start Service Discovery operations + * ensure to set proper values for RSSI threshold and UUID + * filter list. So it is safe to just reset them here. + */ + hci_discovery_filter_clear(hdev); + + if (list_empty(&hdev->pend_le_conns) && + list_empty(&hdev->pend_le_reports)) { + /* If there is no pending LE connections or devices + * to be scanned for, we should stop the background + * scanning. + */ + + /* If controller is not scanning we are done. */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) + return; + + hci_req_add_le_scan_disable(req); + + BT_DBG("%s stopping background scanning", hdev->name); + } else { + /* If there is at least one pending LE connection, we should + * keep the background scan running. + */ + + /* If controller is connecting, we should not start scanning + * since some controllers are not able to scan and connect at + * the same time. + */ + if (hci_lookup_le_connect(hdev)) + return; + + /* If controller is currently scanning, we stop it to ensure we + * don't miss any advertising (due to duplicates filter). + */ + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(req); + + hci_req_add_le_passive_scan(req); + + BT_DBG("%s starting background scanning", hdev->name); + } +} + +void __hci_req_update_name(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_write_local_name cp; + + memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); + + hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); +} + +#define PNP_INFO_SVCLASS_ID 0x1200 + +static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 4) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + u16 uuid16; + + if (uuid->size != 16) + continue; + + uuid16 = get_unaligned_le16(&uuid->uuid[12]); + if (uuid16 < 0x1100) + continue; + + if (uuid16 == PNP_INFO_SVCLASS_ID) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID16_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u16) > len) { + uuids_start[1] = EIR_UUID16_SOME; + break; + } + + *ptr++ = (uuid16 & 0x00ff); + *ptr++ = (uuid16 & 0xff00) >> 8; + uuids_start[0] += sizeof(uuid16); + } + + return ptr; +} + +static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 6) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 32) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID32_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u32) > len) { + uuids_start[1] = EIR_UUID32_SOME; + break; + } + + memcpy(ptr, &uuid->uuid[12], sizeof(u32)); + ptr += sizeof(u32); + uuids_start[0] += sizeof(u32); + } + + return ptr; +} + +static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 18) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 128) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID128_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + 16 > len) { + uuids_start[1] = EIR_UUID128_SOME; + break; + } + + memcpy(ptr, uuid->uuid, 16); + ptr += 16; + uuids_start[0] += 16; + } + + return ptr; +} + +static void create_eir(struct hci_dev *hdev, u8 *data) +{ + u8 *ptr = data; + size_t name_len; + + name_len = strlen(hdev->dev_name); + + if (name_len > 0) { + /* EIR Data type */ + if (name_len > 48) { + name_len = 48; + ptr[1] = EIR_NAME_SHORT; + } else + ptr[1] = EIR_NAME_COMPLETE; + + /* EIR Data length */ + ptr[0] = name_len + 1; + + memcpy(ptr + 2, hdev->dev_name, name_len); + + ptr += (name_len + 2); + } + + if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 2; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8) hdev->inq_tx_power; + + ptr += 3; + } + + if (hdev->devid_source > 0) { + ptr[0] = 9; + ptr[1] = EIR_DEVICE_ID; + + put_unaligned_le16(hdev->devid_source, ptr + 2); + put_unaligned_le16(hdev->devid_vendor, ptr + 4); + put_unaligned_le16(hdev->devid_product, ptr + 6); + put_unaligned_le16(hdev->devid_version, ptr + 8); + + ptr += 10; + } + + ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); +} + +void __hci_req_update_eir(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_write_eir cp; + + if (!hdev_is_powered(hdev)) + return; + + if (!lmp_ext_inq_capable(hdev)) + return; + + if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) + return; + + if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) + return; + + memset(&cp, 0, sizeof(cp)); + + create_eir(hdev, cp.data); + + if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) + return; + + memcpy(hdev->eir, cp.data, sizeof(cp.data)); + + hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); +} + void hci_req_add_le_scan_disable(struct hci_request *req) { struct hci_cp_le_set_scan_enable cp; @@ -302,6 +815,483 @@ void hci_req_add_le_passive_scan(struct hci_request *req) &enable_cp); } +static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) +{ + u8 instance = hdev->cur_adv_instance; + struct adv_info *adv_instance; + + /* Ignore instance 0 */ + if (instance == 0x00) + return 0; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + + /* TODO: Take into account the "appearance" and "local-name" flags here. + * These are currently being ignored as they are not supported. + */ + return adv_instance->scan_rsp_len; +} + +void __hci_req_disable_advertising(struct hci_request *req) +{ + u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); +} + +static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) +{ + u32 flags; + struct adv_info *adv_instance; + + if (instance == 0x00) { + /* Instance 0 always manages the "Tx Power" and "Flags" + * fields + */ + flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; + + /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting + * corresponds to the "connectable" instance flag. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) + flags |= MGMT_ADV_FLAG_CONNECTABLE; + + return flags; + } + + adv_instance = hci_find_adv_instance(hdev, instance); + + /* Return 0 when we got an invalid instance identifier. */ + if (!adv_instance) + return 0; + + return adv_instance->flags; +} + +void __hci_req_enable_advertising(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_adv_param cp; + u8 own_addr_type, enable = 0x01; + bool connectable; + u32 flags; + + if (hci_conn_num(hdev, LE_LINK) > 0) + return; + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + __hci_req_disable_advertising(req); + + /* Clear the HCI_LE_ADV bit temporarily so that the + * hci_update_random_address knows that it's safe to go ahead + * and write a new random address. The flag will be set back on + * as soon as the SET_ADV_ENABLE HCI command completes. + */ + hci_dev_clear_flag(hdev, HCI_LE_ADV); + + flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); + + /* If the "connectable" instance flag was not set, then choose between + * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. + */ + connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || + mgmt_get_connectable(hdev); + + /* Set require_privacy to true only when non-connectable + * advertising is used. In that case it is fine to use a + * non-resolvable private address. + */ + if (hci_update_random_address(req, !connectable, &own_addr_type) < 0) + return; + + memset(&cp, 0, sizeof(cp)); + cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval); + cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval); + + if (connectable) + cp.type = LE_ADV_IND; + else if (get_cur_adv_instance_scan_rsp_len(hdev)) + cp.type = LE_ADV_SCAN_IND; + else + cp.type = LE_ADV_NONCONN_IND; + + cp.own_address_type = own_addr_type; + cp.channel_map = hdev->le_adv_channel_map; + + hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); +} + +static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +{ + u8 ad_len = 0; + size_t name_len; + + name_len = strlen(hdev->dev_name); + if (name_len > 0) { + size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2; + + if (name_len > max_len) { + name_len = max_len; + ptr[1] = EIR_NAME_SHORT; + } else + ptr[1] = EIR_NAME_COMPLETE; + + ptr[0] = name_len + 1; + + memcpy(ptr + 2, hdev->dev_name, name_len); + + ad_len += (name_len + 2); + ptr += (name_len + 2); + } + + return ad_len; +} + +static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, + u8 *ptr) +{ + struct adv_info *adv_instance; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + + /* TODO: Set the appropriate entries based on advertising instance flags + * here once flags other than 0 are supported. + */ + memcpy(ptr, adv_instance->scan_rsp_data, + adv_instance->scan_rsp_len); + + return adv_instance->scan_rsp_len; +} + +void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_scan_rsp_data cp; + u8 len; + + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return; + + memset(&cp, 0, sizeof(cp)); + + if (instance) + len = create_instance_scan_rsp_data(hdev, instance, cp.data); + else + len = create_default_scan_rsp_data(hdev, cp.data); + + if (hdev->scan_rsp_data_len == len && + !memcmp(cp.data, hdev->scan_rsp_data, len)) + return; + + memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + hdev->scan_rsp_data_len = len; + + cp.length = len; + + hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); +} + +static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) +{ + struct adv_info *adv_instance = NULL; + u8 ad_len = 0, flags = 0; + u32 instance_flags; + + /* Return 0 when the current instance identifier is invalid. */ + if (instance) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + } + + instance_flags = get_adv_instance_flags(hdev, instance); + + /* The Add Advertising command allows userspace to set both the general + * and limited discoverable flags. + */ + if (instance_flags & MGMT_ADV_FLAG_DISCOV) + flags |= LE_AD_GENERAL; + + if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) + flags |= LE_AD_LIMITED; + + if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { + /* If a discovery flag wasn't provided, simply use the global + * settings. + */ + if (!flags) + flags |= mgmt_get_adv_discov_flags(hdev); + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + flags |= LE_AD_NO_BREDR; + + /* If flags would still be empty, then there is no need to + * include the "Flags" AD field". + */ + if (flags) { + ptr[0] = 0x02; + ptr[1] = EIR_FLAGS; + ptr[2] = flags; + + ad_len += 3; + ptr += 3; + } + } + + if (adv_instance) { + memcpy(ptr, adv_instance->adv_data, + adv_instance->adv_data_len); + ad_len += adv_instance->adv_data_len; + ptr += adv_instance->adv_data_len; + } + + /* Provide Tx Power only if we can provide a valid value for it */ + if (hdev->adv_tx_power != HCI_TX_POWER_INVALID && + (instance_flags & MGMT_ADV_FLAG_TX_POWER)) { + ptr[0] = 0x02; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)hdev->adv_tx_power; + + ad_len += 3; + ptr += 3; + } + + return ad_len; +} + +void __hci_req_update_adv_data(struct hci_request *req, u8 instance) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_adv_data cp; + u8 len; + + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return; + + memset(&cp, 0, sizeof(cp)); + + len = create_instance_adv_data(hdev, instance, cp.data); + + /* There's nothing to do if the data hasn't changed */ + if (hdev->adv_data_len == len && + memcmp(cp.data, hdev->adv_data, len) == 0) + return; + + memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + hdev->adv_data_len = len; + + cp.length = len; + + hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); +} + +int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) +{ + struct hci_request req; + + hci_req_init(&req, hdev); + __hci_req_update_adv_data(&req, instance); + + return hci_req_run(&req, NULL); +} + +static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) +{ + BT_DBG("%s status %u", hdev->name, status); +} + +void hci_req_reenable_advertising(struct hci_dev *hdev) +{ + struct hci_request req; + + if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && + list_empty(&hdev->adv_instances)) + return; + + hci_req_init(&req, hdev); + + if (hdev->cur_adv_instance) { + __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance, + true); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + __hci_req_enable_advertising(&req); + } + + hci_req_run(&req, adv_enable_complete); +} + +static void adv_timeout_expire(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + adv_instance_expire.work); + + struct hci_request req; + u8 instance; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + hdev->adv_instance_timeout = 0; + + instance = hdev->cur_adv_instance; + if (instance == 0x00) + goto unlock; + + hci_req_init(&req, hdev); + + hci_req_clear_adv_instance(hdev, &req, instance, false); + + if (list_empty(&hdev->adv_instances)) + __hci_req_disable_advertising(&req); + + hci_req_run(&req, NULL); + +unlock: + hci_dev_unlock(hdev); +} + +int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, + bool force) +{ + struct hci_dev *hdev = req->hdev; + struct adv_info *adv_instance = NULL; + u16 timeout; + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + list_empty(&hdev->adv_instances)) + return -EPERM; + + if (hdev->adv_instance_timeout) + return -EBUSY; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -ENOENT; + + /* A zero timeout means unlimited advertising. As long as there is + * only one instance, duration should be ignored. We still set a timeout + * in case further instances are being added later on. + * + * If the remaining lifetime of the instance is more than the duration + * then the timeout corresponds to the duration, otherwise it will be + * reduced to the remaining instance lifetime. + */ + if (adv_instance->timeout == 0 || + adv_instance->duration <= adv_instance->remaining_time) + timeout = adv_instance->duration; + else + timeout = adv_instance->remaining_time; + + /* The remaining time is being reduced unless the instance is being + * advertised without time limit. + */ + if (adv_instance->timeout) + adv_instance->remaining_time = + adv_instance->remaining_time - timeout; + + hdev->adv_instance_timeout = timeout; + queue_delayed_work(hdev->req_workqueue, + &hdev->adv_instance_expire, + msecs_to_jiffies(timeout * 1000)); + + /* If we're just re-scheduling the same instance again then do not + * execute any HCI commands. This happens when a single instance is + * being advertised. + */ + if (!force && hdev->cur_adv_instance == instance && + hci_dev_test_flag(hdev, HCI_LE_ADV)) + return 0; + + hdev->cur_adv_instance = instance; + __hci_req_update_adv_data(req, instance); + __hci_req_update_scan_rsp_data(req, instance); + __hci_req_enable_advertising(req); + + return 0; +} + +static void cancel_adv_timeout(struct hci_dev *hdev) +{ + if (hdev->adv_instance_timeout) { + hdev->adv_instance_timeout = 0; + cancel_delayed_work(&hdev->adv_instance_expire); + } +} + +/* For a single instance: + * - force == true: The instance will be removed even when its remaining + * lifetime is not zero. + * - force == false: the instance will be deactivated but kept stored unless + * the remaining lifetime is zero. + * + * For instance == 0x00: + * - force == true: All instances will be removed regardless of their timeout + * setting. + * - force == false: Only instances that have a timeout will be removed. + */ +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, + u8 instance, bool force) +{ + struct adv_info *adv_instance, *n, *next_instance = NULL; + int err; + u8 rem_inst; + + /* Cancel any timeout concerning the removed instance(s). */ + if (!instance || hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); + + /* Get the next instance to advertise BEFORE we remove + * the current one. This can be the same instance again + * if there is only one instance. + */ + if (instance && hdev->cur_adv_instance == instance) + next_instance = hci_get_next_instance(hdev, instance); + + if (instance == 0x00) { + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, + list) { + if (!(force || adv_instance->timeout)) + continue; + + rem_inst = adv_instance->instance; + err = hci_remove_adv_instance(hdev, rem_inst); + if (!err) + mgmt_advertising_removed(NULL, hdev, rem_inst); + } + } else { + adv_instance = hci_find_adv_instance(hdev, instance); + + if (force || (adv_instance && adv_instance->timeout && + !adv_instance->remaining_time)) { + /* Don't advertise a removed instance. */ + if (next_instance && + next_instance->instance == instance) + next_instance = NULL; + + err = hci_remove_adv_instance(hdev, instance); + if (!err) + mgmt_advertising_removed(NULL, hdev, instance); + } + } + + if (!req || !hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_ADVERTISING)) + return; + + if (next_instance) + __hci_req_schedule_adv_instance(req, next_instance->instance, + false); +} + static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) { struct hci_dev *hdev = req->hdev; @@ -432,7 +1422,7 @@ static bool disconnected_whitelist_entries(struct hci_dev *hdev) return false; } -void __hci_update_page_scan(struct hci_request *req) +void __hci_req_update_scan(struct hci_request *req) { struct hci_dev *hdev = req->hdev; u8 scan; @@ -452,117 +1442,168 @@ void __hci_update_page_scan(struct hci_request *req) else scan = SCAN_DISABLED; - if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE)) - return; - if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) scan |= SCAN_INQUIRY; + if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) && + test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY)) + return; + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } -void hci_update_page_scan(struct hci_dev *hdev) +static int update_scan(struct hci_request *req, unsigned long opt) { - struct hci_request req; + hci_dev_lock(req->hdev); + __hci_req_update_scan(req); + hci_dev_unlock(req->hdev); + return 0; +} - hci_req_init(&req, hdev); - __hci_update_page_scan(&req); - hci_req_run(&req, NULL); +static void scan_update_work(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, scan_update); + + hci_req_sync(hdev, update_scan, 0, HCI_CMD_TIMEOUT, NULL); } -/* This function controls the background scanning based on hdev->pend_le_conns - * list. If there are pending LE connection we start the background scanning, - * otherwise we stop it. - * - * This function requires the caller holds hdev->lock. - */ -void __hci_update_background_scan(struct hci_request *req) +static int connectable_update(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; - if (!test_bit(HCI_UP, &hdev->flags) || - test_bit(HCI_INIT, &hdev->flags) || - hci_dev_test_flag(hdev, HCI_SETUP) || - hci_dev_test_flag(hdev, HCI_CONFIG) || - hci_dev_test_flag(hdev, HCI_AUTO_OFF) || - hci_dev_test_flag(hdev, HCI_UNREGISTER)) + hci_dev_lock(hdev); + + __hci_req_update_scan(req); + + /* If BR/EDR is not enabled and we disable advertising as a + * by-product of disabling connectable, we need to update the + * advertising flags. + */ + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + __hci_req_update_adv_data(req, hdev->cur_adv_instance); + + /* Update the advertising parameters if necessary */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + !list_empty(&hdev->adv_instances)) + __hci_req_enable_advertising(req); + + __hci_update_background_scan(req); + + hci_dev_unlock(hdev); + + return 0; +} + +static void connectable_update_work(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + connectable_update); + u8 status; + + hci_req_sync(hdev, connectable_update, 0, HCI_CMD_TIMEOUT, &status); + mgmt_set_connectable_complete(hdev, status); +} + +static u8 get_service_classes(struct hci_dev *hdev) +{ + struct bt_uuid *uuid; + u8 val = 0; + + list_for_each_entry(uuid, &hdev->uuids, list) + val |= uuid->svc_hint; + + return val; +} + +void __hci_req_update_class(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + u8 cod[3]; + + BT_DBG("%s", hdev->name); + + if (!hdev_is_powered(hdev)) return; - /* No point in doing scanning if LE support hasn't been enabled */ - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return; - /* If discovery is active don't interfere with it */ - if (hdev->discovery.state != DISCOVERY_STOPPED) + if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return; - /* Reset RSSI and UUID filters when starting background scanning - * since these filters are meant for service discovery only. - * - * The Start Discovery and Start Service Discovery operations - * ensure to set proper values for RSSI threshold and UUID - * filter list. So it is safe to just reset them here. - */ - hci_discovery_filter_clear(hdev); + cod[0] = hdev->minor_class; + cod[1] = hdev->major_class; + cod[2] = get_service_classes(hdev); - if (list_empty(&hdev->pend_le_conns) && - list_empty(&hdev->pend_le_reports)) { - /* If there is no pending LE connections or devices - * to be scanned for, we should stop the background - * scanning. - */ + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) + cod[1] |= 0x20; - /* If controller is not scanning we are done. */ - if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) - return; + if (memcmp(cod, hdev->dev_class, 3) == 0) + return; - hci_req_add_le_scan_disable(req); + hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); +} - BT_DBG("%s stopping background scanning", hdev->name); +static void write_iac(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_write_current_iac_lap cp; + + if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) + return; + + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { + /* Limited discoverable mode */ + cp.num_iac = min_t(u8, hdev->num_iac, 2); + cp.iac_lap[0] = 0x00; /* LIAC */ + cp.iac_lap[1] = 0x8b; + cp.iac_lap[2] = 0x9e; + cp.iac_lap[3] = 0x33; /* GIAC */ + cp.iac_lap[4] = 0x8b; + cp.iac_lap[5] = 0x9e; } else { - /* If there is at least one pending LE connection, we should - * keep the background scan running. - */ + /* General discoverable mode */ + cp.num_iac = 1; + cp.iac_lap[0] = 0x33; /* GIAC */ + cp.iac_lap[1] = 0x8b; + cp.iac_lap[2] = 0x9e; + } - /* If controller is connecting, we should not start scanning - * since some controllers are not able to scan and connect at - * the same time. - */ - if (hci_lookup_le_connect(hdev)) - return; + hci_req_add(req, HCI_OP_WRITE_CURRENT_IAC_LAP, + (cp.num_iac * 3) + 1, &cp); +} - /* If controller is currently scanning, we stop it to ensure we - * don't miss any advertising (due to duplicates filter). - */ - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req); +static int discoverable_update(struct hci_request *req, unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; - hci_req_add_le_passive_scan(req); + hci_dev_lock(hdev); - BT_DBG("%s starting background scanning", hdev->name); + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { + write_iac(req); + __hci_req_update_scan(req); + __hci_req_update_class(req); } -} -static void update_background_scan_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - if (status) - BT_DBG("HCI request failed to update background scanning: " - "status 0x%2.2x", status); -} + /* Advertising instances don't use the global discoverable setting, so + * only update AD if advertising was enabled using Set Advertising. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) + __hci_req_update_adv_data(req, 0x00); -void hci_update_background_scan(struct hci_dev *hdev) -{ - int err; - struct hci_request req; + hci_dev_unlock(hdev); - hci_req_init(&req, hdev); + return 0; +} - __hci_update_background_scan(&req); +static void discoverable_update_work(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + discoverable_update); + u8 status; - err = hci_req_run(&req, update_background_scan_complete); - if (err && err != -ENODATA) - BT_ERR("Failed to run HCI request: err %d", err); + hci_req_sync(hdev, discoverable_update, 0, HCI_CMD_TIMEOUT, &status); + mgmt_set_discoverable_complete(hdev, status); } void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, @@ -657,3 +1698,574 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) return 0; } + +static int update_bg_scan(struct hci_request *req, unsigned long opt) +{ + hci_dev_lock(req->hdev); + __hci_update_background_scan(req); + hci_dev_unlock(req->hdev); + return 0; +} + +static void bg_scan_update(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + bg_scan_update); + struct hci_conn *conn; + u8 status; + int err; + + err = hci_req_sync(hdev, update_bg_scan, 0, HCI_CMD_TIMEOUT, &status); + if (!err) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + if (conn) + hci_le_conn_failed(conn, status); + + hci_dev_unlock(hdev); +} + +static int le_scan_disable(struct hci_request *req, unsigned long opt) +{ + hci_req_add_le_scan_disable(req); + return 0; +} + +static int bredr_inquiry(struct hci_request *req, unsigned long opt) +{ + u8 length = opt; + const u8 giac[3] = { 0x33, 0x8b, 0x9e }; + const u8 liac[3] = { 0x00, 0x8b, 0x9e }; + struct hci_cp_inquiry cp; + + BT_DBG("%s", req->hdev->name); + + hci_dev_lock(req->hdev); + hci_inquiry_cache_flush(req->hdev); + hci_dev_unlock(req->hdev); + + memset(&cp, 0, sizeof(cp)); + + if (req->hdev->discovery.limited) + memcpy(&cp.lap, liac, sizeof(cp.lap)); + else + memcpy(&cp.lap, giac, sizeof(cp.lap)); + + cp.length = length; + + hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); + + return 0; +} + +static void le_scan_disable_work(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + le_scan_disable.work); + u8 status; + + BT_DBG("%s", hdev->name); + + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) + return; + + cancel_delayed_work(&hdev->le_scan_restart); + + hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status); + if (status) { + BT_ERR("Failed to disable LE scan: status 0x%02x", status); + return; + } + + hdev->discovery.scan_start = 0; + + /* If we were running LE only scan, change discovery state. If + * we were running both LE and BR/EDR inquiry simultaneously, + * and BR/EDR inquiry is already finished, stop discovery, + * otherwise BR/EDR inquiry will stop discovery when finished. + * If we will resolve remote device name, do not change + * discovery state. + */ + + if (hdev->discovery.type == DISCOV_TYPE_LE) + goto discov_stopped; + + if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) + return; + + if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { + if (!test_bit(HCI_INQUIRY, &hdev->flags) && + hdev->discovery.state != DISCOVERY_RESOLVING) + goto discov_stopped; + + return; + } + + hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN, + HCI_CMD_TIMEOUT, &status); + if (status) { + BT_ERR("Inquiry failed: status 0x%02x", status); + goto discov_stopped; + } + + return; + +discov_stopped: + hci_dev_lock(hdev); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + hci_dev_unlock(hdev); +} + +static int le_scan_restart(struct hci_request *req, unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_scan_enable cp; + + /* If controller is not scanning we are done. */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) + return 0; + + hci_req_add_le_scan_disable(req); + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_ENABLE; + cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + + return 0; +} + +static void le_scan_restart_work(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + le_scan_restart.work); + unsigned long timeout, duration, scan_start, now; + u8 status; + + BT_DBG("%s", hdev->name); + + hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status); + if (status) { + BT_ERR("Failed to restart LE scan: status %d", status); + return; + } + + hci_dev_lock(hdev); + + if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || + !hdev->discovery.scan_start) + goto unlock; + + /* When the scan was started, hdev->le_scan_disable has been queued + * after duration from scan_start. During scan restart this job + * has been canceled, and we need to queue it again after proper + * timeout, to make sure that scan does not run indefinitely. + */ + duration = hdev->discovery.scan_duration; + scan_start = hdev->discovery.scan_start; + now = jiffies; + if (now - scan_start <= duration) { + int elapsed; + + if (now >= scan_start) + elapsed = now - scan_start; + else + elapsed = ULONG_MAX - scan_start + now; + + timeout = duration - elapsed; + } else { + timeout = 0; + } + + queue_delayed_work(hdev->req_workqueue, + &hdev->le_scan_disable, timeout); + +unlock: + hci_dev_unlock(hdev); +} + +static void disable_advertising(struct hci_request *req) +{ + u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); +} + +static int active_scan(struct hci_request *req, unsigned long opt) +{ + uint16_t interval = opt; + struct hci_dev *hdev = req->hdev; + struct hci_cp_le_set_scan_param param_cp; + struct hci_cp_le_set_scan_enable enable_cp; + u8 own_addr_type; + int err; + + BT_DBG("%s", hdev->name); + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { + hci_dev_lock(hdev); + + /* Don't let discovery abort an outgoing connection attempt + * that's using directed advertising. + */ + if (hci_lookup_le_connect(hdev)) { + hci_dev_unlock(hdev); + return -EBUSY; + } + + cancel_adv_timeout(hdev); + hci_dev_unlock(hdev); + + disable_advertising(req); + } + + /* If controller is scanning, it means the background scanning is + * running. Thus, we should temporarily stop it in order to set the + * discovery scanning parameters. + */ + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) + hci_req_add_le_scan_disable(req); + + /* All active scans will be done with either a resolvable private + * address (when privacy feature has been enabled) or non-resolvable + * private address. + */ + err = hci_update_random_address(req, true, &own_addr_type); + if (err < 0) + own_addr_type = ADDR_LE_DEV_PUBLIC; + + memset(¶m_cp, 0, sizeof(param_cp)); + param_cp.type = LE_SCAN_ACTIVE; + param_cp.interval = cpu_to_le16(interval); + param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN); + param_cp.own_address_type = own_addr_type; + + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), + ¶m_cp); + + memset(&enable_cp, 0, sizeof(enable_cp)); + enable_cp.enable = LE_SCAN_ENABLE; + enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), + &enable_cp); + + return 0; +} + +static int interleaved_discov(struct hci_request *req, unsigned long opt) +{ + int err; + + BT_DBG("%s", req->hdev->name); + + err = active_scan(req, opt); + if (err) + return err; + + return bredr_inquiry(req, DISCOV_BREDR_INQUIRY_LEN); +} + +static void start_discovery(struct hci_dev *hdev, u8 *status) +{ + unsigned long timeout; + + BT_DBG("%s type %u", hdev->name, hdev->discovery.type); + + switch (hdev->discovery.type) { + case DISCOV_TYPE_BREDR: + if (!hci_dev_test_flag(hdev, HCI_INQUIRY)) + hci_req_sync(hdev, bredr_inquiry, + DISCOV_BREDR_INQUIRY_LEN, HCI_CMD_TIMEOUT, + status); + return; + case DISCOV_TYPE_INTERLEAVED: + /* When running simultaneous discovery, the LE scanning time + * should occupy the whole discovery time sine BR/EDR inquiry + * and LE scanning are scheduled by the controller. + * + * For interleaving discovery in comparison, BR/EDR inquiry + * and LE scanning are done sequentially with separate + * timeouts. + */ + if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, + &hdev->quirks)) { + timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); + /* During simultaneous discovery, we double LE scan + * interval. We must leave some time for the controller + * to do BR/EDR inquiry. + */ + hci_req_sync(hdev, interleaved_discov, + DISCOV_LE_SCAN_INT * 2, HCI_CMD_TIMEOUT, + status); + break; + } + + timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); + hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT, + HCI_CMD_TIMEOUT, status); + break; + case DISCOV_TYPE_LE: + timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); + hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT, + HCI_CMD_TIMEOUT, status); + break; + default: + *status = HCI_ERROR_UNSPECIFIED; + return; + } + + if (*status) + return; + + BT_DBG("%s timeout %u ms", hdev->name, jiffies_to_msecs(timeout)); + + /* When service discovery is used and the controller has a + * strict duplicate filter, it is important to remember the + * start and duration of the scan. This is required for + * restarting scanning during the discovery phase. + */ + if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && + hdev->discovery.result_filtering) { + hdev->discovery.scan_start = jiffies; + hdev->discovery.scan_duration = timeout; + } + + queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable, + timeout); +} + +bool hci_req_stop_discovery(struct hci_request *req) +{ + struct hci_dev *hdev = req->hdev; + struct discovery_state *d = &hdev->discovery; + struct hci_cp_remote_name_req_cancel cp; + struct inquiry_entry *e; + bool ret = false; + + BT_DBG("%s state %u", hdev->name, hdev->discovery.state); + + if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) { + if (test_bit(HCI_INQUIRY, &hdev->flags)) + hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL); + + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { + cancel_delayed_work(&hdev->le_scan_disable); + hci_req_add_le_scan_disable(req); + } + + ret = true; + } else { + /* Passive scanning */ + if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { + hci_req_add_le_scan_disable(req); + ret = true; + } + } + + /* No further actions needed for LE-only discovery */ + if (d->type == DISCOV_TYPE_LE) + return ret; + + if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) { + e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, + NAME_PENDING); + if (!e) + return ret; + + bacpy(&cp.bdaddr, &e->data.bdaddr); + hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), + &cp); + ret = true; + } + + return ret; +} + +static int stop_discovery(struct hci_request *req, unsigned long opt) +{ + hci_dev_lock(req->hdev); + hci_req_stop_discovery(req); + hci_dev_unlock(req->hdev); + + return 0; +} + +static void discov_update(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + discov_update); + u8 status = 0; + + switch (hdev->discovery.state) { + case DISCOVERY_STARTING: + start_discovery(hdev, &status); + mgmt_start_discovery_complete(hdev, status); + if (status) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + else + hci_discovery_set_state(hdev, DISCOVERY_FINDING); + break; + case DISCOVERY_STOPPING: + hci_req_sync(hdev, stop_discovery, 0, HCI_CMD_TIMEOUT, &status); + mgmt_stop_discovery_complete(hdev, status); + if (!status) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + break; + case DISCOVERY_STOPPED: + default: + return; + } +} + +static void discov_off(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + discov_off.work); + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + /* When discoverable timeout triggers, then just make sure + * the limited discoverable flag is cleared. Even in the case + * of a timeout triggered from general discoverable, it is + * safe to unconditionally clear the flag. + */ + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); + hdev->discov_timeout = 0; + + hci_dev_unlock(hdev); + + hci_req_sync(hdev, discoverable_update, 0, HCI_CMD_TIMEOUT, NULL); + mgmt_new_settings(hdev); +} + +static int powered_update_hci(struct hci_request *req, unsigned long opt) +{ + struct hci_dev *hdev = req->hdev; + u8 link_sec; + + hci_dev_lock(hdev); + + if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) && + !lmp_host_ssp_capable(hdev)) { + u8 mode = 0x01; + + hci_req_add(req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode); + + if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) { + u8 support = 0x01; + + hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT, + sizeof(support), &support); + } + } + + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && + lmp_bredr_capable(hdev)) { + struct hci_cp_write_le_host_supported cp; + + cp.le = 0x01; + cp.simul = 0x00; + + /* Check first if we already have the right + * host state (host features set) + */ + if (cp.le != lmp_host_le_capable(hdev) || + cp.simul != lmp_host_le_br_capable(hdev)) + hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED, + sizeof(cp), &cp); + } + + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + /* Make sure the controller has a good default for + * advertising data. This also applies to the case + * where BR/EDR was toggled during the AUTO_OFF phase. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + list_empty(&hdev->adv_instances)) { + __hci_req_update_adv_data(req, 0x00); + __hci_req_update_scan_rsp_data(req, 0x00); + + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) + __hci_req_enable_advertising(req); + } else if (!list_empty(&hdev->adv_instances)) { + struct adv_info *adv_instance; + + adv_instance = list_first_entry(&hdev->adv_instances, + struct adv_info, list); + __hci_req_schedule_adv_instance(req, + adv_instance->instance, + true); + } + } + + link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY); + if (link_sec != test_bit(HCI_AUTH, &hdev->flags)) + hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, + sizeof(link_sec), &link_sec); + + if (lmp_bredr_capable(hdev)) { + if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) + __hci_req_write_fast_connectable(req, true); + else + __hci_req_write_fast_connectable(req, false); + __hci_req_update_scan(req); + __hci_req_update_class(req); + __hci_req_update_name(req); + __hci_req_update_eir(req); + } + + hci_dev_unlock(hdev); + return 0; +} + +int __hci_req_hci_power_on(struct hci_dev *hdev) +{ + /* Register the available SMP channels (BR/EDR and LE) only when + * successfully powering on the controller. This late + * registration is required so that LE SMP can clearly decide if + * the public address or static address is used. + */ + smp_register(hdev); + + return __hci_req_sync(hdev, powered_update_hci, 0, HCI_CMD_TIMEOUT, + NULL); +} + +void hci_request_setup(struct hci_dev *hdev) +{ + INIT_WORK(&hdev->discov_update, discov_update); + INIT_WORK(&hdev->bg_scan_update, bg_scan_update); + INIT_WORK(&hdev->scan_update, scan_update_work); + INIT_WORK(&hdev->connectable_update, connectable_update_work); + INIT_WORK(&hdev->discoverable_update, discoverable_update_work); + INIT_DELAYED_WORK(&hdev->discov_off, discov_off); + INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); + INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); + INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); +} + +void hci_request_cancel_all(struct hci_dev *hdev) +{ + hci_req_sync_cancel(hdev, ENODEV); + + cancel_work_sync(&hdev->discov_update); + cancel_work_sync(&hdev->bg_scan_update); + cancel_work_sync(&hdev->scan_update); + cancel_work_sync(&hdev->connectable_update); + cancel_work_sync(&hdev->discoverable_update); + cancel_delayed_work_sync(&hdev->discov_off); + cancel_delayed_work_sync(&hdev->le_scan_disable); + cancel_delayed_work_sync(&hdev->le_scan_restart); + + if (hdev->adv_instance_timeout) { + cancel_delayed_work_sync(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; + } +} diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 25c7f1305dcb..64ff8c040d50 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -20,6 +20,9 @@ SOFTWARE IS DISCLAIMED. */ +#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock) +#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock) + struct hci_request { struct hci_dev *hdev; struct sk_buff_head cmd_q; @@ -41,21 +44,61 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb); +int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req, + unsigned long opt), + unsigned long opt, u32 timeout, u8 *hci_status); +int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, + unsigned long opt), + unsigned long opt, u32 timeout, u8 *hci_status); +void hci_req_sync_cancel(struct hci_dev *hdev, int err); + struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param); +int __hci_req_hci_power_on(struct hci_dev *hdev); + +void __hci_req_write_fast_connectable(struct hci_request *req, bool enable); +void __hci_req_update_name(struct hci_request *req); +void __hci_req_update_eir(struct hci_request *req); + void hci_req_add_le_scan_disable(struct hci_request *req); void hci_req_add_le_passive_scan(struct hci_request *req); -void hci_update_page_scan(struct hci_dev *hdev); -void __hci_update_page_scan(struct hci_request *req); +void hci_req_reenable_advertising(struct hci_dev *hdev); +void __hci_req_enable_advertising(struct hci_request *req); +void __hci_req_disable_advertising(struct hci_request *req); +void __hci_req_update_adv_data(struct hci_request *req, u8 instance); +int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance); +void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance); + +int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, + bool force); +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, + u8 instance, bool force); + +void __hci_req_update_class(struct hci_request *req); + +/* Returns true if HCI commands were queued */ +bool hci_req_stop_discovery(struct hci_request *req); + +static inline void hci_req_update_scan(struct hci_dev *hdev) +{ + queue_work(hdev->req_workqueue, &hdev->scan_update); +} + +void __hci_req_update_scan(struct hci_request *req); int hci_update_random_address(struct hci_request *req, bool require_privacy, u8 *own_addr_type); -void hci_update_background_scan(struct hci_dev *hdev); -void __hci_update_background_scan(struct hci_request *req); - int hci_abort_conn(struct hci_conn *conn, u8 reason); void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, u8 reason); + +static inline void hci_update_background_scan(struct hci_dev *hdev) +{ + queue_work(hdev->req_workqueue, &hdev->bg_scan_update); +} + +void hci_request_setup(struct hci_dev *hdev); +void hci_request_cancel_all(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index b1eb8c09a660..1298d723c0e0 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -25,6 +25,7 @@ /* Bluetooth HCI sockets. */ #include <linux/export.h> +#include <linux/utsname.h> #include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> @@ -120,13 +121,13 @@ static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb) /* Apply filter */ flt = &hci_pi(sk)->filter; - flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS; + flt_type = hci_skb_pkt_type(skb) & HCI_FLT_TYPE_BITS; if (!test_bit(flt_type, &flt->type_mask)) return true; /* Extra filter for event packets only */ - if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT) + if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT) return false; flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); @@ -170,19 +171,19 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) { - if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT && - bt_cb(skb)->pkt_type != HCI_EVENT_PKT && - bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && - bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) + if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && + hci_skb_pkt_type(skb) != HCI_EVENT_PKT && + hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) continue; if (is_filtered_packet(sk, skb)) continue; } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { if (!bt_cb(skb)->incoming) continue; - if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT && - bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && - bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) + if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && + hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) continue; } else { /* Don't send frame to other channel types */ @@ -196,7 +197,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; /* Put type byte before the data */ - memcpy(skb_push(skb_copy, 1), &bt_cb(skb)->pkt_type, 1); + memcpy(skb_push(skb_copy, 1), &hci_skb_pkt_type(skb), 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); @@ -262,7 +263,7 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("hdev %p len %d", hdev, skb->len); - switch (bt_cb(skb)->pkt_type) { + switch (hci_skb_pkt_type(skb)) { case HCI_COMMAND_PKT: opcode = cpu_to_le16(HCI_MON_COMMAND_PKT); break; @@ -294,7 +295,7 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) return; /* Put header before the data */ - hdr = (void *) skb_push(skb_copy, HCI_MON_HDR_SIZE); + hdr = (void *)skb_push(skb_copy, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len); @@ -375,7 +376,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) __net_timestamp(skb); - hdr = (void *) skb_push(skb, HCI_MON_HDR_SIZE); + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = opcode; hdr->index = cpu_to_le16(hdev->id); hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); @@ -383,6 +384,38 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return skb; } +static void __printf(2, 3) +send_monitor_note(struct sock *sk, const char *fmt, ...) +{ + size_t len; + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + va_list args; + + va_start(args, fmt); + len = vsnprintf(NULL, 0, fmt, args); + va_end(args); + + skb = bt_skb_alloc(len + 1, GFP_ATOMIC); + if (!skb) + return; + + va_start(args, fmt); + vsprintf(skb_put(skb, len), fmt, args); + *skb_put(skb, 1) = 0; + va_end(args); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_SYSTEM_NOTE); + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + if (sock_queue_rcv_skb(sk, skb)) + kfree_skb(skb); +} + static void send_monitor_replay(struct sock *sk) { struct hci_dev *hdev; @@ -436,18 +469,18 @@ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) if (!skb) return; - hdr = (void *) skb_put(skb, HCI_EVENT_HDR_SIZE); + hdr = (void *)skb_put(skb, HCI_EVENT_HDR_SIZE); hdr->evt = HCI_EV_STACK_INTERNAL; hdr->plen = sizeof(*ev) + dlen; - ev = (void *) skb_put(skb, sizeof(*ev) + dlen); + ev = (void *)skb_put(skb, sizeof(*ev) + dlen); ev->type = type; memcpy(ev->data, data, dlen); bt_cb(skb)->incoming = 1; __net_timestamp(skb); - bt_cb(skb)->pkt_type = HCI_EVENT_PKT; + hci_skb_pkt_type(skb) = HCI_EVENT_PKT; hci_send_to_sock(hdev, skb); kfree_skb(skb); } @@ -653,20 +686,20 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, return -EOPNOTSUPP; case HCIGETCONNINFO: - return hci_get_conn_info(hdev, (void __user *) arg); + return hci_get_conn_info(hdev, (void __user *)arg); case HCIGETAUTHINFO: - return hci_get_auth_info(hdev, (void __user *) arg); + return hci_get_auth_info(hdev, (void __user *)arg); case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return hci_sock_blacklist_add(hdev, (void __user *) arg); + return hci_sock_blacklist_add(hdev, (void __user *)arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return hci_sock_blacklist_del(hdev, (void __user *) arg); + return hci_sock_blacklist_del(hdev, (void __user *)arg); } return -ENOIOCTLCMD; @@ -675,7 +708,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - void __user *argp = (void __user *) arg; + void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; int err; @@ -872,11 +905,28 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, */ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + send_monitor_note(sk, "Linux version %s (%s)", + init_utsname()->release, + init_utsname()->machine); + send_monitor_note(sk, "Bluetooth subsystem version %s", + BT_SUBSYS_VERSION); send_monitor_replay(sk); atomic_inc(&monitor_promisc); break; + case HCI_CHANNEL_LOGGING: + if (haddr.hci_dev != HCI_DEV_NONE) { + err = -EINVAL; + goto done; + } + + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + goto done; + } + break; + default: if (!hci_mgmt_chan_find(haddr.hci_channel)) { err = -EINVAL; @@ -926,7 +976,7 @@ done: static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) { - struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr; + struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr; struct sock *sk = sock->sk; struct hci_dev *hdev; int err = 0; @@ -991,8 +1041,8 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, } } -static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; @@ -1004,6 +1054,9 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (flags & MSG_OOB) return -EOPNOTSUPP; + if (hci_pi(sk)->channel == HCI_CHANNEL_LOGGING) + return -EOPNOTSUPP; + if (sk->sk_state == BT_CLOSED) return 0; @@ -1150,6 +1203,90 @@ done: return err; } +static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + struct hci_dev *hdev; + u16 index; + int err; + + /* The logging frame consists at minimum of the standard header, + * the priority byte, the ident length byte and at least one string + * terminator NUL byte. Anything shorter are invalid packets. + */ + if (len < sizeof(*hdr) + 3) + return -EINVAL; + + skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + if (memcpy_from_msg(skb_put(skb, len), msg, len)) { + err = -EFAULT; + goto drop; + } + + hdr = (void *)skb->data; + + if (__le16_to_cpu(hdr->len) != len - sizeof(*hdr)) { + err = -EINVAL; + goto drop; + } + + if (__le16_to_cpu(hdr->opcode) == 0x0000) { + __u8 priority = skb->data[sizeof(*hdr)]; + __u8 ident_len = skb->data[sizeof(*hdr) + 1]; + + /* Only the priorities 0-7 are valid and with that any other + * value results in an invalid packet. + * + * The priority byte is followed by an ident length byte and + * the NUL terminated ident string. Check that the ident + * length is not overflowing the packet and also that the + * ident string itself is NUL terminated. In case the ident + * length is zero, the length value actually doubles as NUL + * terminator identifier. + * + * The message follows the ident string (if present) and + * must be NUL terminated. Otherwise it is not a valid packet. + */ + if (priority > 7 || skb->data[len - 1] != 0x00 || + ident_len > len - sizeof(*hdr) - 3 || + skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) { + err = -EINVAL; + goto drop; + } + } else { + err = -EINVAL; + goto drop; + } + + index = __le16_to_cpu(hdr->index); + + if (index != MGMT_INDEX_NONE) { + hdev = hci_dev_get(index); + if (!hdev) { + err = -ENODEV; + goto drop; + } + } else { + hdev = NULL; + } + + hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING); + + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); + err = len; + + if (hdev) + hci_dev_put(hdev); + +drop: + kfree_skb(skb); + return err; +} + static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { @@ -1179,6 +1316,9 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; goto done; + case HCI_CHANNEL_LOGGING: + err = hci_logging_frame(sk, msg, len); + goto done; default: mutex_lock(&mgmt_chan_list_lock); chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); @@ -1211,7 +1351,7 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, goto drop; } - bt_cb(skb)->pkt_type = *((unsigned char *) skb->data); + hci_skb_pkt_type(skb) = skb->data[0]; skb_pull(skb, 1); if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { @@ -1220,16 +1360,16 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, * * However check that the packet type is valid. */ - if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT && - bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && - bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && + hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { err = -EINVAL; goto drop; } skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); - } else if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { + } else if (hci_skb_pkt_type(skb) == HCI_COMMAND_PKT) { u16 opcode = get_unaligned_le16(skb->data); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); @@ -1242,6 +1382,11 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, goto drop; } + /* Since the opcode has already been extracted here, store + * a copy of the value for later use by the drivers. + */ + hci_skb_opcode(skb) = opcode; + if (ogf == 0x3f) { skb_queue_tail(&hdev->raw_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); @@ -1249,7 +1394,7 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* Stand-alone HCI commands must be flagged as * single-command requests. */ - bt_cb(skb)->hci.req_start = true; + bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -1260,8 +1405,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, goto drop; } - if (bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT && - bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) { + if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && + hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) { err = -EINVAL; goto drop; } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 66e8b6ee19a5..39a5149f3010 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -6538,8 +6538,6 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff *skb) { - int err = 0; - BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb, chan->rx_state); @@ -6570,7 +6568,7 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, chan->last_acked_seq = control->txseq; chan->expected_tx_seq = __next_seq(chan, control->txseq); - return err; + return 0; } static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) @@ -7113,8 +7111,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan->dcid = cid; if (bdaddr_type_is_le(dst_type)) { - u8 role; - /* Convert from L2CAP channel address type to HCI address type */ if (dst_type == BDADDR_LE_PUBLIC) @@ -7123,14 +7119,15 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, dst_type = ADDR_LE_DEV_RANDOM; if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - role = HCI_ROLE_SLAVE; + hcon = hci_connect_le(hdev, dst, dst_type, + chan->sec_level, + HCI_LE_CONN_TIMEOUT, + HCI_ROLE_SLAVE); else - role = HCI_ROLE_MASTER; + hcon = hci_connect_le_scan(hdev, dst, dst_type, + chan->sec_level, + HCI_LE_CONN_TIMEOUT); - hcon = hci_connect_le_scan(hdev, dst, dst_type, - chan->sec_level, - HCI_LE_CONN_TIMEOUT, - role); } else { u8 auth_type = l2cap_get_auth_type(chan); hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7f22119276f3..5a5089cb6570 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 10 +#define MGMT_REVISION 11 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -102,6 +102,8 @@ static const u16 mgmt_commands[] = { MGMT_OP_READ_ADV_FEATURES, MGMT_OP_ADD_ADVERTISING, MGMT_OP_REMOVE_ADVERTISING, + MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_OP_START_LIMITED_DISCOVERY, }; static const u16 mgmt_events[] = { @@ -718,116 +720,6 @@ static u32 get_current_settings(struct hci_dev *hdev) return settings; } -#define PNP_INFO_SVCLASS_ID 0x1200 - -static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 4) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - u16 uuid16; - - if (uuid->size != 16) - continue; - - uuid16 = get_unaligned_le16(&uuid->uuid[12]); - if (uuid16 < 0x1100) - continue; - - if (uuid16 == PNP_INFO_SVCLASS_ID) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID16_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + sizeof(u16) > len) { - uuids_start[1] = EIR_UUID16_SOME; - break; - } - - *ptr++ = (uuid16 & 0x00ff); - *ptr++ = (uuid16 & 0xff00) >> 8; - uuids_start[0] += sizeof(uuid16); - } - - return ptr; -} - -static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 6) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - if (uuid->size != 32) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID32_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + sizeof(u32) > len) { - uuids_start[1] = EIR_UUID32_SOME; - break; - } - - memcpy(ptr, &uuid->uuid[12], sizeof(u32)); - ptr += sizeof(u32); - uuids_start[0] += sizeof(u32); - } - - return ptr; -} - -static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 18) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - if (uuid->size != 128) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID128_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + 16 > len) { - uuids_start[1] = EIR_UUID128_SOME; - break; - } - - memcpy(ptr, uuid->uuid, 16); - ptr += 16; - uuids_start[0] += 16; - } - - return ptr; -} - static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev) { return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev); @@ -840,98 +732,7 @@ static struct mgmt_pending_cmd *pending_find_data(u16 opcode, return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data); } -static u8 get_current_adv_instance(struct hci_dev *hdev) -{ - /* The "Set Advertising" setting supersedes the "Add Advertising" - * setting. Here we set the advertising data based on which - * setting was set. When neither apply, default to the global settings, - * represented by instance "0". - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - !hci_dev_test_flag(hdev, HCI_ADVERTISING)) - return hdev->cur_adv_instance; - - return 0x00; -} - -static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) -{ - u8 ad_len = 0; - size_t name_len; - - name_len = strlen(hdev->dev_name); - if (name_len > 0) { - size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2; - - if (name_len > max_len) { - name_len = max_len; - ptr[1] = EIR_NAME_SHORT; - } else - ptr[1] = EIR_NAME_COMPLETE; - - ptr[0] = name_len + 1; - - memcpy(ptr + 2, hdev->dev_name, name_len); - - ad_len += (name_len + 2); - ptr += (name_len + 2); - } - - return ad_len; -} - -static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, - u8 *ptr) -{ - struct adv_info *adv_instance; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - - /* TODO: Set the appropriate entries based on advertising instance flags - * here once flags other than 0 are supported. - */ - memcpy(ptr, adv_instance->scan_rsp_data, - adv_instance->scan_rsp_len); - - return adv_instance->scan_rsp_len; -} - -static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_rsp_data cp; - u8 len; - - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - return; - - memset(&cp, 0, sizeof(cp)); - - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, cp.data); - else - len = create_default_scan_rsp_data(hdev, cp.data); - - if (hdev->scan_rsp_data_len == len && - !memcmp(cp.data, hdev->scan_rsp_data, len)) - return; - - memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); - hdev->scan_rsp_data_len = len; - - cp.length = len; - - hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); -} - -static void update_scan_rsp_data(struct hci_request *req) -{ - update_inst_scan_rsp_data(req, get_current_adv_instance(req->hdev)); -} - -static u8 get_adv_discov_flags(struct hci_dev *hdev) +u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; @@ -955,7 +756,7 @@ static u8 get_adv_discov_flags(struct hci_dev *hdev) return 0; } -static bool get_connectable(struct hci_dev *hdev) +bool mgmt_get_connectable(struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; @@ -972,344 +773,6 @@ static bool get_connectable(struct hci_dev *hdev) return hci_dev_test_flag(hdev, HCI_CONNECTABLE); } -static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) -{ - u32 flags; - struct adv_info *adv_instance; - - if (instance == 0x00) { - /* Instance 0 always manages the "Tx Power" and "Flags" - * fields - */ - flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; - - /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting - * corresponds to the "connectable" instance flag. - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) - flags |= MGMT_ADV_FLAG_CONNECTABLE; - - return flags; - } - - adv_instance = hci_find_adv_instance(hdev, instance); - - /* Return 0 when we got an invalid instance identifier. */ - if (!adv_instance) - return 0; - - return adv_instance->flags; -} - -static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) -{ - u8 instance = get_current_adv_instance(hdev); - struct adv_info *adv_instance; - - /* Ignore instance 0 */ - if (instance == 0x00) - return 0; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - - /* TODO: Take into account the "appearance" and "local-name" flags here. - * These are currently being ignored as they are not supported. - */ - return adv_instance->scan_rsp_len; -} - -static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) -{ - struct adv_info *adv_instance = NULL; - u8 ad_len = 0, flags = 0; - u32 instance_flags; - - /* Return 0 when the current instance identifier is invalid. */ - if (instance) { - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - } - - instance_flags = get_adv_instance_flags(hdev, instance); - - /* The Add Advertising command allows userspace to set both the general - * and limited discoverable flags. - */ - if (instance_flags & MGMT_ADV_FLAG_DISCOV) - flags |= LE_AD_GENERAL; - - if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) - flags |= LE_AD_LIMITED; - - if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { - /* If a discovery flag wasn't provided, simply use the global - * settings. - */ - if (!flags) - flags |= get_adv_discov_flags(hdev); - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - flags |= LE_AD_NO_BREDR; - - /* If flags would still be empty, then there is no need to - * include the "Flags" AD field". - */ - if (flags) { - ptr[0] = 0x02; - ptr[1] = EIR_FLAGS; - ptr[2] = flags; - - ad_len += 3; - ptr += 3; - } - } - - if (adv_instance) { - memcpy(ptr, adv_instance->adv_data, - adv_instance->adv_data_len); - ad_len += adv_instance->adv_data_len; - ptr += adv_instance->adv_data_len; - } - - /* Provide Tx Power only if we can provide a valid value for it */ - if (hdev->adv_tx_power != HCI_TX_POWER_INVALID && - (instance_flags & MGMT_ADV_FLAG_TX_POWER)) { - ptr[0] = 0x02; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8)hdev->adv_tx_power; - - ad_len += 3; - ptr += 3; - } - - return ad_len; -} - -static void update_inst_adv_data(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_adv_data cp; - u8 len; - - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - return; - - memset(&cp, 0, sizeof(cp)); - - len = create_instance_adv_data(hdev, instance, cp.data); - - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) - return; - - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); - hdev->adv_data_len = len; - - cp.length = len; - - hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); -} - -static void update_adv_data(struct hci_request *req) -{ - update_inst_adv_data(req, get_current_adv_instance(req->hdev)); -} - -int mgmt_update_adv_data(struct hci_dev *hdev) -{ - struct hci_request req; - - hci_req_init(&req, hdev); - update_adv_data(&req); - - return hci_req_run(&req, NULL); -} - -static void create_eir(struct hci_dev *hdev, u8 *data) -{ - u8 *ptr = data; - size_t name_len; - - name_len = strlen(hdev->dev_name); - - if (name_len > 0) { - /* EIR Data type */ - if (name_len > 48) { - name_len = 48; - ptr[1] = EIR_NAME_SHORT; - } else - ptr[1] = EIR_NAME_COMPLETE; - - /* EIR Data length */ - ptr[0] = name_len + 1; - - memcpy(ptr + 2, hdev->dev_name, name_len); - - ptr += (name_len + 2); - } - - if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) { - ptr[0] = 2; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8) hdev->inq_tx_power; - - ptr += 3; - } - - if (hdev->devid_source > 0) { - ptr[0] = 9; - ptr[1] = EIR_DEVICE_ID; - - put_unaligned_le16(hdev->devid_source, ptr + 2); - put_unaligned_le16(hdev->devid_vendor, ptr + 4); - put_unaligned_le16(hdev->devid_product, ptr + 6); - put_unaligned_le16(hdev->devid_version, ptr + 8); - - ptr += 10; - } - - ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); - ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); - ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); -} - -static void update_eir(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_write_eir cp; - - if (!hdev_is_powered(hdev)) - return; - - if (!lmp_ext_inq_capable(hdev)) - return; - - if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) - return; - - if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) - return; - - memset(&cp, 0, sizeof(cp)); - - create_eir(hdev, cp.data); - - if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) - return; - - memcpy(hdev->eir, cp.data, sizeof(cp.data)); - - hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); -} - -static u8 get_service_classes(struct hci_dev *hdev) -{ - struct bt_uuid *uuid; - u8 val = 0; - - list_for_each_entry(uuid, &hdev->uuids, list) - val |= uuid->svc_hint; - - return val; -} - -static void update_class(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - u8 cod[3]; - - BT_DBG("%s", hdev->name); - - if (!hdev_is_powered(hdev)) - return; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - return; - - if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) - return; - - cod[0] = hdev->minor_class; - cod[1] = hdev->major_class; - cod[2] = get_service_classes(hdev); - - if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) - cod[1] |= 0x20; - - if (memcmp(cod, hdev->dev_class, 3) == 0) - return; - - hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); -} - -static void disable_advertising(struct hci_request *req) -{ - u8 enable = 0x00; - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - -static void enable_advertising(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_adv_param cp; - u8 own_addr_type, enable = 0x01; - bool connectable; - u8 instance; - u32 flags; - - if (hci_conn_num(hdev, LE_LINK) > 0) - return; - - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - disable_advertising(req); - - /* Clear the HCI_LE_ADV bit temporarily so that the - * hci_update_random_address knows that it's safe to go ahead - * and write a new random address. The flag will be set back on - * as soon as the SET_ADV_ENABLE HCI command completes. - */ - hci_dev_clear_flag(hdev, HCI_LE_ADV); - - instance = get_current_adv_instance(hdev); - flags = get_adv_instance_flags(hdev, instance); - - /* If the "connectable" instance flag was not set, then choose between - * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. - */ - connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || - get_connectable(hdev); - - /* Set require_privacy to true only when non-connectable - * advertising is used. In that case it is fine to use a - * non-resolvable private address. - */ - if (hci_update_random_address(req, !connectable, &own_addr_type) < 0) - return; - - memset(&cp, 0, sizeof(cp)); - cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval); - cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval); - - if (connectable) - cp.type = LE_ADV_IND; - else if (get_cur_adv_instance_scan_rsp_len(hdev)) - cp.type = LE_ADV_SCAN_IND; - else - cp.type = LE_ADV_NONCONN_IND; - - cp.own_address_type = own_addr_type; - cp.channel_map = hdev->le_adv_channel_map; - - hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - static void service_cache_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, @@ -1323,8 +786,8 @@ static void service_cache_off(struct work_struct *work) hci_dev_lock(hdev); - update_eir(&req); - update_class(&req); + __hci_req_update_eir(&req); + __hci_req_update_class(&req); hci_dev_unlock(hdev); @@ -1345,10 +808,11 @@ static void rpa_expired(struct work_struct *work) return; /* The generation of a new RPA and programming it into the - * controller happens in the enable_advertising() function. + * controller happens in the hci_req_enable_advertising() + * function. */ hci_req_init(&req, hdev); - enable_advertising(&req); + __hci_req_enable_advertising(&req); hci_req_run(&req, NULL); } @@ -1416,51 +880,7 @@ static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode) } } -static bool hci_stop_discovery(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_remote_name_req_cancel cp; - struct inquiry_entry *e; - - switch (hdev->discovery.state) { - case DISCOVERY_FINDING: - if (test_bit(HCI_INQUIRY, &hdev->flags)) - hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL); - - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - cancel_delayed_work(&hdev->le_scan_disable); - hci_req_add_le_scan_disable(req); - } - - return true; - - case DISCOVERY_RESOLVING: - e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, - NAME_PENDING); - if (!e) - break; - - bacpy(&cp.bdaddr, &e->data.bdaddr); - hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), - &cp); - - return true; - - default: - /* Passive scanning */ - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_req_add_le_scan_disable(req); - return true; - } - - break; - } - - return false; -} - -static void advertising_added(struct sock *sk, struct hci_dev *hdev, - u8 instance) +void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance) { struct mgmt_ev_advertising_added ev; @@ -1469,8 +889,8 @@ static void advertising_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADVERTISING_ADDED, hdev, &ev, sizeof(ev), sk); } -static void advertising_removed(struct sock *sk, struct hci_dev *hdev, - u8 instance) +void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, + u8 instance) { struct mgmt_ev_advertising_removed ev; @@ -1479,65 +899,6 @@ static void advertising_removed(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk); } -static int schedule_adv_instance(struct hci_request *req, u8 instance, - bool force) { - struct hci_dev *hdev = req->hdev; - struct adv_info *adv_instance = NULL; - u16 timeout; - - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) - return -EPERM; - - if (hdev->adv_instance_timeout) - return -EBUSY; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return -ENOENT; - - /* A zero timeout means unlimited advertising. As long as there is - * only one instance, duration should be ignored. We still set a timeout - * in case further instances are being added later on. - * - * If the remaining lifetime of the instance is more than the duration - * then the timeout corresponds to the duration, otherwise it will be - * reduced to the remaining instance lifetime. - */ - if (adv_instance->timeout == 0 || - adv_instance->duration <= adv_instance->remaining_time) - timeout = adv_instance->duration; - else - timeout = adv_instance->remaining_time; - - /* The remaining time is being reduced unless the instance is being - * advertised without time limit. - */ - if (adv_instance->timeout) - adv_instance->remaining_time = - adv_instance->remaining_time - timeout; - - hdev->adv_instance_timeout = timeout; - queue_delayed_work(hdev->workqueue, - &hdev->adv_instance_expire, - msecs_to_jiffies(timeout * 1000)); - - /* If we're just re-scheduling the same instance again then do not - * execute any HCI commands. This happens when a single instance is - * being advertised. - */ - if (!force && hdev->cur_adv_instance == instance && - hci_dev_test_flag(hdev, HCI_LE_ADV)) - return 0; - - hdev->cur_adv_instance = instance; - update_adv_data(req); - update_scan_rsp_data(req); - enable_advertising(req); - - return 0; -} - static void cancel_adv_timeout(struct hci_dev *hdev) { if (hdev->adv_instance_timeout) { @@ -1546,76 +907,6 @@ static void cancel_adv_timeout(struct hci_dev *hdev) } } -/* For a single instance: - * - force == true: The instance will be removed even when its remaining - * lifetime is not zero. - * - force == false: the instance will be deactivated but kept stored unless - * the remaining lifetime is zero. - * - * For instance == 0x00: - * - force == true: All instances will be removed regardless of their timeout - * setting. - * - force == false: Only instances that have a timeout will be removed. - */ -static void clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force) -{ - struct adv_info *adv_instance, *n, *next_instance = NULL; - int err; - u8 rem_inst; - - /* Cancel any timeout concerning the removed instance(s). */ - if (!instance || hdev->cur_adv_instance == instance) - cancel_adv_timeout(hdev); - - /* Get the next instance to advertise BEFORE we remove - * the current one. This can be the same instance again - * if there is only one instance. - */ - if (instance && hdev->cur_adv_instance == instance) - next_instance = hci_get_next_instance(hdev, instance); - - if (instance == 0x00) { - list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, - list) { - if (!(force || adv_instance->timeout)) - continue; - - rem_inst = adv_instance->instance; - err = hci_remove_adv_instance(hdev, rem_inst); - if (!err) - advertising_removed(NULL, hdev, rem_inst); - } - hdev->cur_adv_instance = 0x00; - } else { - adv_instance = hci_find_adv_instance(hdev, instance); - - if (force || (adv_instance && adv_instance->timeout && - !adv_instance->remaining_time)) { - /* Don't advertise a removed instance. */ - if (next_instance && - next_instance->instance == instance) - next_instance = NULL; - - err = hci_remove_adv_instance(hdev, instance); - if (!err) - advertising_removed(NULL, hdev, instance); - } - } - - if (list_empty(&hdev->adv_instances)) { - hdev->cur_adv_instance = 0x00; - hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); - } - - if (!req || !hdev_is_powered(hdev) || - hci_dev_test_flag(hdev, HCI_ADVERTISING)) - return; - - if (next_instance) - schedule_adv_instance(req, next_instance->instance, false); -} - static int clean_up_hci_state(struct hci_dev *hdev) { struct hci_request req; @@ -1631,12 +922,12 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - clear_adv_instance(hdev, NULL, 0x00, false); + hci_req_clear_adv_instance(hdev, NULL, 0x00, false); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - disable_advertising(&req); + __hci_req_disable_advertising(&req); - discov_stopped = hci_stop_discovery(&req); + discov_stopped = hci_req_stop_discovery(&req); list_for_each_entry(conn, &hdev->conn_hash.list, list) { /* 0x15 == Terminated due to Power Off */ @@ -1671,17 +962,6 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { - cancel_delayed_work(&hdev->power_off); - - if (cp->val) { - mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, - data, len); - err = mgmt_powered(hdev, 1); - goto failed; - } - } - if (!!cp->val == hdev_is_powered(hdev)) { err = send_settings_rsp(sk, MGMT_OP_SET_POWERED, hdev); goto failed; @@ -1805,13 +1085,9 @@ static u8 mgmt_le_support(struct hci_dev *hdev) return MGMT_STATUS_SUCCESS; } -static void set_discoverable_complete(struct hci_dev *hdev, u8 status, - u16 opcode) +void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - struct mgmt_mode *cp; - struct hci_request req; - bool changed; BT_DBG("status 0x%02x", status); @@ -1828,33 +1104,14 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status, goto remove_cmd; } - cp = cmd->param; - if (cp->val) { - changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); - - if (hdev->discov_timeout > 0) { - int to = msecs_to_jiffies(hdev->discov_timeout * 1000); - queue_delayed_work(hdev->workqueue, &hdev->discov_off, - to); - } - } else { - changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); + if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && + hdev->discov_timeout > 0) { + int to = msecs_to_jiffies(hdev->discov_timeout * 1000); + queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to); } send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev); - - if (changed) - new_settings(hdev, cmd->sk); - - /* When the discoverable mode gets changed, make sure - * that class of device has the limited discoverable - * bit correctly set. Also update page scan based on whitelist - * entries. - */ - hci_req_init(&req, hdev); - __hci_update_page_scan(&req); - update_class(&req); - hci_req_run(&req, NULL); + new_settings(hdev, cmd->sk); remove_cmd: mgmt_pending_remove(cmd); @@ -1868,9 +1125,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_set_discoverable *cp = data; struct mgmt_pending_cmd *cmd; - struct hci_request req; u16 timeout; - u8 scan; int err; BT_DBG("request for %s", hdev->name); @@ -1949,8 +1204,8 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, if (cp->val && hdev->discov_timeout > 0) { int to = msecs_to_jiffies(hdev->discov_timeout * 1000); - queue_delayed_work(hdev->workqueue, &hdev->discov_off, - to); + queue_delayed_work(hdev->req_workqueue, + &hdev->discov_off, to); } err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev); @@ -1970,105 +1225,28 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, cancel_delayed_work(&hdev->discov_off); hdev->discov_timeout = timeout; + if (cp->val) + hci_dev_set_flag(hdev, HCI_DISCOVERABLE); + else + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); + /* Limited discoverable mode */ if (cp->val == 0x02) hci_dev_set_flag(hdev, HCI_LIMITED_DISCOVERABLE); else hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); - hci_req_init(&req, hdev); - - /* The procedure for LE-only controllers is much simpler - just - * update the advertising data. - */ - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - goto update_ad; - - scan = SCAN_PAGE; - - if (cp->val) { - struct hci_cp_write_current_iac_lap hci_cp; - - if (cp->val == 0x02) { - /* Limited discoverable mode */ - hci_cp.num_iac = min_t(u8, hdev->num_iac, 2); - hci_cp.iac_lap[0] = 0x00; /* LIAC */ - hci_cp.iac_lap[1] = 0x8b; - hci_cp.iac_lap[2] = 0x9e; - hci_cp.iac_lap[3] = 0x33; /* GIAC */ - hci_cp.iac_lap[4] = 0x8b; - hci_cp.iac_lap[5] = 0x9e; - } else { - /* General discoverable mode */ - hci_cp.num_iac = 1; - hci_cp.iac_lap[0] = 0x33; /* GIAC */ - hci_cp.iac_lap[1] = 0x8b; - hci_cp.iac_lap[2] = 0x9e; - } - - hci_req_add(&req, HCI_OP_WRITE_CURRENT_IAC_LAP, - (hci_cp.num_iac * 3) + 1, &hci_cp); - - scan |= SCAN_INQUIRY; - } else { - hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); - } - - hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, sizeof(scan), &scan); - -update_ad: - update_adv_data(&req); - - err = hci_req_run(&req, set_discoverable_complete); - if (err < 0) - mgmt_pending_remove(cmd); + queue_work(hdev->req_workqueue, &hdev->discoverable_update); + err = 0; failed: hci_dev_unlock(hdev); return err; } -static void write_fast_connectable(struct hci_request *req, bool enable) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_write_page_scan_activity acp; - u8 type; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - return; - - if (hdev->hci_ver < BLUETOOTH_VER_1_2) - return; - - if (enable) { - type = PAGE_SCAN_TYPE_INTERLACED; - - /* 160 msec page scan interval */ - acp.interval = cpu_to_le16(0x0100); - } else { - type = PAGE_SCAN_TYPE_STANDARD; /* default */ - - /* default 1.28 sec page scan */ - acp.interval = cpu_to_le16(0x0800); - } - - acp.window = cpu_to_le16(0x0012); - - if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || - __cpu_to_le16(hdev->page_scan_window) != acp.window) - hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, - sizeof(acp), &acp); - - if (hdev->page_scan_type != type) - hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); -} - -static void set_connectable_complete(struct hci_dev *hdev, u8 status, - u16 opcode) +void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - struct mgmt_mode *cp; - bool conn_changed, discov_changed; BT_DBG("status 0x%02x", status); @@ -2084,27 +1262,8 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status, goto remove_cmd; } - cp = cmd->param; - if (cp->val) { - conn_changed = !hci_dev_test_and_set_flag(hdev, - HCI_CONNECTABLE); - discov_changed = false; - } else { - conn_changed = hci_dev_test_and_clear_flag(hdev, - HCI_CONNECTABLE); - discov_changed = hci_dev_test_and_clear_flag(hdev, - HCI_DISCOVERABLE); - } - send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev); - - if (conn_changed || discov_changed) { - new_settings(hdev, cmd->sk); - hci_update_page_scan(hdev); - if (discov_changed) - mgmt_update_adv_data(hdev); - hci_update_background_scan(hdev); - } + new_settings(hdev, cmd->sk); remove_cmd: mgmt_pending_remove(cmd); @@ -2134,7 +1293,7 @@ static int set_connectable_update_settings(struct hci_dev *hdev, return err; if (changed) { - hci_update_page_scan(hdev); + hci_req_update_scan(hdev); hci_update_background_scan(hdev); return new_settings(hdev, sk); } @@ -2147,8 +1306,6 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_mode *cp = data; struct mgmt_pending_cmd *cmd; - struct hci_request req; - u8 scan; int err; BT_DBG("request for %s", hdev->name); @@ -2182,57 +1339,19 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - hci_req_init(&req, hdev); - - /* If BR/EDR is not enabled and we disable advertising as a - * by-product of disabling connectable, we need to update the - * advertising flags. - */ - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { - if (!cp->val) { - hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); - hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); - } - update_adv_data(&req); - } else if (cp->val != test_bit(HCI_PSCAN, &hdev->flags)) { - if (cp->val) { - scan = SCAN_PAGE; - } else { - /* If we don't have any whitelist entries just - * disable all scanning. If there are entries - * and we had both page and inquiry scanning - * enabled then fall back to only page scanning. - * Otherwise no changes are needed. - */ - if (list_empty(&hdev->whitelist)) - scan = SCAN_DISABLED; - else if (test_bit(HCI_ISCAN, &hdev->flags)) - scan = SCAN_PAGE; - else - goto no_scan_update; - - if (test_bit(HCI_ISCAN, &hdev->flags) && - hdev->discov_timeout > 0) - cancel_delayed_work(&hdev->discov_off); - } + if (cp->val) { + hci_dev_set_flag(hdev, HCI_CONNECTABLE); + } else { + if (hdev->discov_timeout > 0) + cancel_delayed_work(&hdev->discov_off); - hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); + hci_dev_clear_flag(hdev, HCI_CONNECTABLE); } -no_scan_update: - /* Update the advertising parameters if necessary */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) - enable_advertising(&req); - - err = hci_req_run(&req, set_connectable_complete); - if (err < 0) { - mgmt_pending_remove(cmd); - if (err == -ENODATA) - err = set_connectable_update_settings(hdev, sk, - cp->val); - goto failed; - } + queue_work(hdev->req_workqueue, &hdev->connectable_update); + err = 0; failed: hci_dev_unlock(hdev); @@ -2508,10 +1627,10 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct hci_request req; hci_req_init(&req, hdev); - update_adv_data(&req); - update_scan_rsp_data(&req); - __hci_update_background_scan(&req); + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); hci_req_run(&req, NULL); + hci_update_background_scan(hdev); } unlock: @@ -2560,7 +1679,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) enabled = lmp_host_le_capable(hdev); if (!val) - clear_adv_instance(hdev, NULL, 0x00, true); + hci_req_clear_adv_instance(hdev, NULL, 0x00, true); if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -2607,7 +1726,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_cp.simul = 0x00; } else { if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - disable_advertising(&req); + __hci_req_disable_advertising(&req); } hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp), @@ -2722,8 +1841,8 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) hci_req_init(&req, hdev); - update_class(&req); - update_eir(&req); + __hci_req_update_class(&req); + __hci_req_update_eir(&req); err = hci_req_run(&req, add_uuid_complete); if (err < 0) { @@ -2822,8 +1941,8 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, update_class: hci_req_init(&req, hdev); - update_class(&req); - update_eir(&req); + __hci_req_update_class(&req); + __hci_req_update_eir(&req); err = hci_req_run(&req, remove_uuid_complete); if (err < 0) { @@ -2898,10 +2017,10 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, hci_dev_unlock(hdev); cancel_delayed_work_sync(&hdev->service_cache); hci_dev_lock(hdev); - update_eir(&req); + __hci_req_update_eir(&req); } - update_class(&req); + __hci_req_update_class(&req); err = hci_req_run(&req, set_class_complete); if (err < 0) { @@ -3561,8 +2680,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type, sec_level, - HCI_LE_CONN_TIMEOUT, - HCI_ROLE_MASTER); + HCI_LE_CONN_TIMEOUT); } if (IS_ERR(conn)) { @@ -3803,16 +2921,6 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, HCI_OP_USER_PASSKEY_NEG_REPLY, 0); } -static void update_name(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_write_local_name cp; - - memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); - - hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); -} - static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_cp_set_local_name *cp; @@ -3891,15 +2999,15 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, hci_req_init(&req, hdev); if (lmp_bredr_capable(hdev)) { - update_name(&req); - update_eir(&req); + __hci_req_update_name(&req); + __hci_req_update_eir(&req); } /* The name is stored in the scan response data and so * no need to udpate the advertising data here. */ if (lmp_le_capable(hdev)) - update_scan_rsp_data(&req); + __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); err = hci_req_run(&req, set_name_complete); if (err < 0) @@ -4164,145 +3272,9 @@ done: return err; } -static bool trigger_bredr_inquiry(struct hci_request *req, u8 *status) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_inquiry cp; - /* General inquiry access code (GIAC) */ - u8 lap[3] = { 0x33, 0x8b, 0x9e }; - - *status = mgmt_bredr_support(hdev); - if (*status) - return false; - - if (hci_dev_test_flag(hdev, HCI_INQUIRY)) { - *status = MGMT_STATUS_BUSY; - return false; - } - - hci_inquiry_cache_flush(hdev); - - memset(&cp, 0, sizeof(cp)); - memcpy(&cp.lap, lap, sizeof(cp.lap)); - cp.length = DISCOV_BREDR_INQUIRY_LEN; - - hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); - - return true; -} - -static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_param param_cp; - struct hci_cp_le_set_scan_enable enable_cp; - u8 own_addr_type; - int err; - - *status = mgmt_le_support(hdev); - if (*status) - return false; - - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { - /* Don't let discovery abort an outgoing connection attempt - * that's using directed advertising. - */ - if (hci_lookup_le_connect(hdev)) { - *status = MGMT_STATUS_REJECTED; - return false; - } - - cancel_adv_timeout(hdev); - disable_advertising(req); - } - - /* If controller is scanning, it means the background scanning is - * running. Thus, we should temporarily stop it in order to set the - * discovery scanning parameters. - */ - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) - hci_req_add_le_scan_disable(req); - - /* All active scans will be done with either a resolvable private - * address (when privacy feature has been enabled) or non-resolvable - * private address. - */ - err = hci_update_random_address(req, true, &own_addr_type); - if (err < 0) { - *status = MGMT_STATUS_FAILED; - return false; - } - - memset(¶m_cp, 0, sizeof(param_cp)); - param_cp.type = LE_SCAN_ACTIVE; - param_cp.interval = cpu_to_le16(interval); - param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN); - param_cp.own_address_type = own_addr_type; - - hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), - ¶m_cp); - - memset(&enable_cp, 0, sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), - &enable_cp); - - return true; -} - -static bool trigger_discovery(struct hci_request *req, u8 *status) -{ - struct hci_dev *hdev = req->hdev; - - switch (hdev->discovery.type) { - case DISCOV_TYPE_BREDR: - if (!trigger_bredr_inquiry(req, status)) - return false; - break; - - case DISCOV_TYPE_INTERLEAVED: - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, - &hdev->quirks)) { - /* During simultaneous discovery, we double LE scan - * interval. We must leave some time for the controller - * to do BR/EDR inquiry. - */ - if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT * 2, - status)) - return false; - - if (!trigger_bredr_inquiry(req, status)) - return false; - - return true; - } - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { - *status = MGMT_STATUS_NOT_SUPPORTED; - return false; - } - /* fall through */ - - case DISCOV_TYPE_LE: - if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT, status)) - return false; - break; - - default: - *status = MGMT_STATUS_INVALID_PARAMS; - return false; - } - - return true; -} - -static void start_discovery_complete(struct hci_dev *hdev, u8 status, - u16 opcode) +void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - unsigned long timeout; BT_DBG("status %d", status); @@ -4312,75 +3284,49 @@ static void start_discovery_complete(struct hci_dev *hdev, u8 status, if (!cmd) cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev); + if (!cmd) + cmd = pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev); + if (cmd) { cmd->cmd_complete(cmd, mgmt_status(status)); mgmt_pending_remove(cmd); } - if (status) { - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - goto unlock; - } - - hci_discovery_set_state(hdev, DISCOVERY_FINDING); + hci_dev_unlock(hdev); +} - /* If the scan involves LE scan, pick proper timeout to schedule - * hdev->le_scan_disable that will stop it. - */ - switch (hdev->discovery.type) { +static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, + uint8_t *mgmt_status) +{ + switch (type) { case DISCOV_TYPE_LE: - timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); + *mgmt_status = mgmt_le_support(hdev); + if (*mgmt_status) + return false; break; case DISCOV_TYPE_INTERLEAVED: - /* When running simultaneous discovery, the LE scanning time - * should occupy the whole discovery time sine BR/EDR inquiry - * and LE scanning are scheduled by the controller. - * - * For interleaving discovery in comparison, BR/EDR inquiry - * and LE scanning are done sequentially with separate - * timeouts. - */ - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) - timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); - else - timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); - break; + *mgmt_status = mgmt_le_support(hdev); + if (*mgmt_status) + return false; + /* Intentional fall-through */ case DISCOV_TYPE_BREDR: - timeout = 0; + *mgmt_status = mgmt_bredr_support(hdev); + if (*mgmt_status) + return false; break; default: - BT_ERR("Invalid discovery type %d", hdev->discovery.type); - timeout = 0; - break; - } - - if (timeout) { - /* When service discovery is used and the controller has - * a strict duplicate filter, it is important to remember - * the start and duration of the scan. This is required - * for restarting scanning during the discovery phase. - */ - if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, - &hdev->quirks) && - hdev->discovery.result_filtering) { - hdev->discovery.scan_start = jiffies; - hdev->discovery.scan_duration = timeout; - } - - queue_delayed_work(hdev->workqueue, - &hdev->le_scan_disable, timeout); + *mgmt_status = MGMT_STATUS_INVALID_PARAMS; + return false; } -unlock: - hci_dev_unlock(hdev); + return true; } -static int start_discovery(struct sock *sk, struct hci_dev *hdev, - void *data, u16 len) +static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, + u16 op, void *data, u16 len) { struct mgmt_cp_start_discovery *cp = data; struct mgmt_pending_cmd *cmd; - struct hci_request req; u8 status; int err; @@ -4389,7 +3335,7 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, + err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_NOT_POWERED, &cp->type, sizeof(cp->type)); goto failed; @@ -4397,20 +3343,17 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, if (hdev->discovery.state != DISCOVERY_STOPPED || hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, - MGMT_STATUS_BUSY, &cp->type, - sizeof(cp->type)); + err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY, + &cp->type, sizeof(cp->type)); goto failed; } - cmd = mgmt_pending_add(sk, MGMT_OP_START_DISCOVERY, hdev, data, len); - if (!cmd) { - err = -ENOMEM; + if (!discovery_type_is_valid(hdev, cp->type, &status)) { + err = mgmt_cmd_complete(sk, hdev->id, op, status, + &cp->type, sizeof(cp->type)); goto failed; } - cmd->cmd_complete = generic_cmd_complete; - /* Clear the discovery filter first to free any previously * allocated memory for the UUID list. */ @@ -4418,29 +3361,43 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev, hdev->discovery.type = cp->type; hdev->discovery.report_invalid_rssi = false; + if (op == MGMT_OP_START_LIMITED_DISCOVERY) + hdev->discovery.limited = true; + else + hdev->discovery.limited = false; - hci_req_init(&req, hdev); - - if (!trigger_discovery(&req, &status)) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY, - status, &cp->type, sizeof(cp->type)); - mgmt_pending_remove(cmd); + cmd = mgmt_pending_add(sk, op, hdev, data, len); + if (!cmd) { + err = -ENOMEM; goto failed; } - err = hci_req_run(&req, start_discovery_complete); - if (err < 0) { - mgmt_pending_remove(cmd); - goto failed; - } + cmd->cmd_complete = generic_cmd_complete; hci_discovery_set_state(hdev, DISCOVERY_STARTING); + queue_work(hdev->req_workqueue, &hdev->discov_update); + err = 0; failed: hci_dev_unlock(hdev); return err; } +static int start_discovery(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + return start_discovery_internal(sk, hdev, MGMT_OP_START_DISCOVERY, + data, len); +} + +static int start_limited_discovery(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + return start_discovery_internal(sk, hdev, + MGMT_OP_START_LIMITED_DISCOVERY, + data, len); +} + static int service_discovery_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { @@ -4453,7 +3410,6 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_start_service_discovery *cp = data; struct mgmt_pending_cmd *cmd; - struct hci_request req; const u16 max_uuid_count = ((U16_MAX - sizeof(*cp)) / 16); u16 uuid_count, expected_len; u8 status; @@ -4502,6 +3458,13 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, goto failed; } + if (!discovery_type_is_valid(hdev, cp->type, &status)) { + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_START_SERVICE_DISCOVERY, + status, &cp->type, sizeof(cp->type)); + goto failed; + } + cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY, hdev, data, len); if (!cmd) { @@ -4534,30 +3497,16 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, } } - hci_req_init(&req, hdev); - - if (!trigger_discovery(&req, &status)) { - err = mgmt_cmd_complete(sk, hdev->id, - MGMT_OP_START_SERVICE_DISCOVERY, - status, &cp->type, sizeof(cp->type)); - mgmt_pending_remove(cmd); - goto failed; - } - - err = hci_req_run(&req, start_discovery_complete); - if (err < 0) { - mgmt_pending_remove(cmd); - goto failed; - } - hci_discovery_set_state(hdev, DISCOVERY_STARTING); + queue_work(hdev->req_workqueue, &hdev->discov_update); + err = 0; failed: hci_dev_unlock(hdev); return err; } -static void stop_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode) +void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; @@ -4571,9 +3520,6 @@ static void stop_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode) mgmt_pending_remove(cmd); } - if (!status) - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - hci_dev_unlock(hdev); } @@ -4582,7 +3528,6 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_stop_discovery *mgmt_cp = data; struct mgmt_pending_cmd *cmd; - struct hci_request req; int err; BT_DBG("%s", hdev->name); @@ -4611,24 +3556,9 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, cmd->cmd_complete = generic_cmd_complete; - hci_req_init(&req, hdev); - - hci_stop_discovery(&req); - - err = hci_req_run(&req, stop_discovery_complete); - if (!err) { - hci_discovery_set_state(hdev, DISCOVERY_STOPPING); - goto unlock; - } - - mgmt_pending_remove(cmd); - - /* If no HCI commands were sent we're done */ - if (err == -ENODATA) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0, - &mgmt_cp->type, sizeof(mgmt_cp->type)); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - } + hci_discovery_set_state(hdev, DISCOVERY_STOPPING); + queue_work(hdev->req_workqueue, &hdev->discov_update); + err = 0; unlock: hci_dev_unlock(hdev); @@ -4776,7 +3706,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, NULL, 0); hci_req_init(&req, hdev); - update_eir(&req); + __hci_req_update_eir(&req); hci_req_run(&req, NULL); hci_dev_unlock(hdev); @@ -4826,7 +3756,6 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, * set up earlier, then re-enable multi-instance advertising. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) || list_empty(&hdev->adv_instances)) goto unlock; @@ -4842,7 +3771,7 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, hci_req_init(&req, hdev); - err = schedule_adv_instance(&req, instance, true); + err = __hci_req_schedule_adv_instance(&req, instance, true); if (!err) err = hci_req_run(&req, enable_advertising_instance); @@ -4892,6 +3821,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; if (cp->val) { + hdev->cur_adv_instance = 0x00; changed = !hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING); if (cp->val == 0x02) hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); @@ -4939,11 +3869,12 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, * We cannot use update_[adv|scan_rsp]_data() here as the * HCI_ADVERTISING flag is not yet set. */ - update_inst_adv_data(&req, 0x00); - update_inst_scan_rsp_data(&req, 0x00); - enable_advertising(&req); + hdev->cur_adv_instance = 0x00; + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + __hci_req_enable_advertising(&req); } else { - disable_advertising(&req); + __hci_req_disable_advertising(&req); } err = hci_req_run(&req, set_advertising_complete); @@ -5140,7 +4071,7 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - write_fast_connectable(&req, cp->val); + __hci_req_write_fast_connectable(&req, cp->val); err = hci_req_run(&req, fast_connectable_complete); if (err < 0) { @@ -5275,20 +4206,20 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) goto unlock; } - /* We need to flip the bit already here so that update_adv_data - * generates the correct flags. + /* We need to flip the bit already here so that + * hci_req_update_adv_data generates the correct flags. */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); hci_req_init(&req, hdev); - write_fast_connectable(&req, false); - __hci_update_page_scan(&req); + __hci_req_write_fast_connectable(&req, false); + __hci_req_update_scan(&req); /* Since only the advertising data flags will change, there * is no need to update the scan response data. */ - update_adv_data(&req); + __hci_req_update_adv_data(&req, hdev->cur_adv_instance); err = hci_req_run(&req, set_bredr_complete); if (err < 0) @@ -6076,10 +5007,9 @@ static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) } /* This function requires the caller holds hdev->lock */ -static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr, +static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, u8 auto_connect) { - struct hci_dev *hdev = req->hdev; struct hci_conn_params *params; params = hci_conn_params_add(hdev, addr, addr_type); @@ -6099,26 +5029,17 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr, */ if (params->explicit_connect) list_add(¶ms->action, &hdev->pend_le_conns); - - __hci_update_background_scan(req); break; case HCI_AUTO_CONN_REPORT: if (params->explicit_connect) list_add(¶ms->action, &hdev->pend_le_conns); else list_add(¶ms->action, &hdev->pend_le_reports); - __hci_update_background_scan(req); break; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - if (!is_connected(hdev, addr, addr_type)) { + if (!is_connected(hdev, addr, addr_type)) list_add(¶ms->action, &hdev->pend_le_conns); - /* If we are in scan phase of connecting, we were - * already added to pend_le_conns and scanning. - */ - if (params->auto_connect != HCI_AUTO_CONN_EXPLICIT) - __hci_update_background_scan(req); - } break; } @@ -6142,31 +5063,10 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } -static void add_device_complete(struct hci_dev *hdev, u8 status, u16 opcode) -{ - struct mgmt_pending_cmd *cmd; - - BT_DBG("status 0x%02x", status); - - hci_dev_lock(hdev); - - cmd = pending_find(MGMT_OP_ADD_DEVICE, hdev); - if (!cmd) - goto unlock; - - cmd->cmd_complete(cmd, mgmt_status(status)); - mgmt_pending_remove(cmd); - -unlock: - hci_dev_unlock(hdev); -} - static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_add_device *cp = data; - struct mgmt_pending_cmd *cmd; - struct hci_request req; u8 auto_conn, addr_type; int err; @@ -6183,24 +5083,15 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); - hci_req_init(&req, hdev); - hci_dev_lock(hdev); - cmd = mgmt_pending_add(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); - if (!cmd) { - err = -ENOMEM; - goto unlock; - } - - cmd->cmd_complete = addr_cmd_complete; - if (cp->addr.type == BDADDR_BREDR) { /* Only incoming connections action is supported for now */ if (cp->action != 0x01) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } @@ -6209,7 +5100,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, if (err) goto unlock; - __hci_update_page_scan(&req); + hci_req_update_scan(hdev); goto added; } @@ -6229,33 +5120,31 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, * hci_conn_params_lookup. */ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { - err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } /* If the connection parameters don't exist for this device, * they will be created and configured with defaults. */ - if (hci_conn_params_set(&req, &cp->addr.bdaddr, addr_type, + if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, auto_conn) < 0) { - err = cmd->cmd_complete(cmd, MGMT_STATUS_FAILED); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); goto unlock; } + hci_update_background_scan(hdev); + added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); - err = hci_req_run(&req, add_device_complete); - if (err < 0) { - /* ENODATA means no HCI commands were needed (e.g. if - * the adapter is powered off). - */ - if (err == -ENODATA) - err = cmd->cmd_complete(cmd, MGMT_STATUS_SUCCESS); - mgmt_pending_remove(cmd); - } + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_SUCCESS, &cp->addr, + sizeof(cp->addr)); unlock: hci_dev_unlock(hdev); @@ -6273,55 +5162,25 @@ static void device_removed(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk); } -static void remove_device_complete(struct hci_dev *hdev, u8 status, u16 opcode) -{ - struct mgmt_pending_cmd *cmd; - - BT_DBG("status 0x%02x", status); - - hci_dev_lock(hdev); - - cmd = pending_find(MGMT_OP_REMOVE_DEVICE, hdev); - if (!cmd) - goto unlock; - - cmd->cmd_complete(cmd, mgmt_status(status)); - mgmt_pending_remove(cmd); - -unlock: - hci_dev_unlock(hdev); -} - static int remove_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_remove_device *cp = data; - struct mgmt_pending_cmd *cmd; - struct hci_request req; int err; BT_DBG("%s", hdev->name); - hci_req_init(&req, hdev); - hci_dev_lock(hdev); - cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_DEVICE, hdev, data, len); - if (!cmd) { - err = -ENOMEM; - goto unlock; - } - - cmd->cmd_complete = addr_cmd_complete; - if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) { struct hci_conn_params *params; u8 addr_type; if (!bdaddr_type_is_valid(cp->addr.type)) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } @@ -6330,13 +5189,15 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, &cp->addr.bdaddr, cp->addr.type); if (err) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, + sizeof(cp->addr)); goto unlock; } - __hci_update_page_scan(&req); + hci_req_update_scan(hdev); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); @@ -6351,33 +5212,36 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, * hci_conn_params_lookup. */ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); if (!params) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } list_del(¶ms->action); list_del(¶ms->list); kfree(params); - __hci_update_background_scan(&req); + hci_update_background_scan(hdev); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); } else { @@ -6385,9 +5249,10 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, struct bdaddr_list *b, *btmp; if (cp->addr.type) { - err = cmd->cmd_complete(cmd, - MGMT_STATUS_INVALID_PARAMS); - mgmt_pending_remove(cmd); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_INVALID_PARAMS, + &cp->addr, sizeof(cp->addr)); goto unlock; } @@ -6397,7 +5262,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, kfree(b); } - __hci_update_page_scan(&req); + hci_req_update_scan(hdev); list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) { if (p->auto_connect == HCI_AUTO_CONN_DISABLED) @@ -6414,20 +5279,13 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, BT_DBG("All LE connection parameters were removed"); - __hci_update_background_scan(&req); + hci_update_background_scan(hdev); } complete: - err = hci_req_run(&req, remove_device_complete); - if (err < 0) { - /* ENODATA means no HCI commands were needed (e.g. if - * the adapter is powered off). - */ - if (err == -ENODATA) - err = cmd->cmd_complete(cmd, MGMT_STATUS_SUCCESS); - mgmt_pending_remove(cmd); - } - + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, + MGMT_STATUS_SUCCESS, &cp->addr, + sizeof(cp->addr)); unlock: hci_dev_unlock(hdev); return err; @@ -6898,7 +5756,7 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, rand, sizeof(rand)); } - flags = get_adv_discov_flags(hdev); + flags = mgmt_get_adv_discov_flags(hdev); if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) flags |= LE_AD_NO_BREDR; @@ -6953,10 +5811,10 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_read_adv_features *rp; size_t rp_len; - int err, i; - bool instance; + int err; struct adv_info *adv_instance; u32 supported_flags; + u8 *instance; BT_DBG("%s", hdev->name); @@ -6966,12 +5824,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); - rp_len = sizeof(*rp); - - instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE); - if (instance) - rp_len += hdev->adv_instance_cnt; - + rp_len = sizeof(*rp) + hdev->adv_instance_cnt; rp = kmalloc(rp_len, GFP_ATOMIC); if (!rp) { hci_dev_unlock(hdev); @@ -6984,19 +5837,12 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, rp->max_adv_data_len = HCI_MAX_AD_LENGTH; rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; rp->max_instances = HCI_MAX_ADV_INSTANCES; + rp->num_instances = hdev->adv_instance_cnt; - if (instance) { - i = 0; - list_for_each_entry(adv_instance, &hdev->adv_instances, list) { - if (i >= hdev->adv_instance_cnt) - break; - - rp->instance[i] = adv_instance->instance; - i++; - } - rp->num_instances = hdev->adv_instance_cnt; - } else { - rp->num_instances = 0; + instance = rp->instance; + list_for_each_entry(adv_instance, &hdev->adv_instances, list) { + *instance = adv_instance->instance; + instance++; } hci_dev_unlock(hdev); @@ -7016,17 +5862,19 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, int i, cur_len; bool flags_managed = false; bool tx_power_managed = false; - u32 flags_params = MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS; - if (is_adv_data && (adv_flags & flags_params)) { - flags_managed = true; - max_len -= 3; - } + if (is_adv_data) { + if (adv_flags & (MGMT_ADV_FLAG_DISCOV | + MGMT_ADV_FLAG_LIMITED_DISCOV | + MGMT_ADV_FLAG_MANAGED_FLAGS)) { + flags_managed = true; + max_len -= 3; + } - if (is_adv_data && (adv_flags & MGMT_ADV_FLAG_TX_POWER)) { - tx_power_managed = true; - max_len -= 3; + if (adv_flags & MGMT_ADV_FLAG_TX_POWER) { + tx_power_managed = true; + max_len -= 3; + } } if (len > max_len) @@ -7067,9 +5915,6 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev); - if (status) - hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE); - list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { if (!adv_instance->pending) continue; @@ -7085,7 +5930,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, cancel_adv_timeout(hdev); hci_remove_adv_instance(hdev, instance); - advertising_removed(cmd ? cmd->sk : NULL, hdev, instance); + mgmt_advertising_removed(cmd ? cmd->sk : NULL, hdev, instance); } if (!cmd) @@ -7107,31 +5952,6 @@ unlock: hci_dev_unlock(hdev); } -void mgmt_adv_timeout_expired(struct hci_dev *hdev) -{ - u8 instance; - struct hci_request req; - - hdev->adv_instance_timeout = 0; - - instance = get_current_adv_instance(hdev); - if (instance == 0x00) - return; - - hci_dev_lock(hdev); - hci_req_init(&req, hdev); - - clear_adv_instance(hdev, &req, instance, false); - - if (list_empty(&hdev->adv_instances)) - disable_advertising(&req); - - if (!skb_queue_empty(&req.cmd_q)) - hci_req_run(&req, NULL); - - hci_dev_unlock(hdev); -} - static int add_advertising(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -7155,6 +5975,10 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, status); + if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_INVALID_PARAMS); + flags = __le32_to_cpu(cp->flags); timeout = __le16_to_cpu(cp->timeout); duration = __le16_to_cpu(cp->duration); @@ -7206,9 +6030,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, * actually added. */ if (hdev->adv_instance_cnt > prev_instance_cnt) - advertising_added(sk, hdev, cp->instance); - - hci_dev_set_flag(hdev, HCI_ADVERTISING_INSTANCE); + mgmt_advertising_added(sk, hdev, cp->instance); if (hdev->cur_adv_instance == cp->instance) { /* If the currently advertised instance is being changed then @@ -7253,7 +6075,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - err = schedule_adv_instance(&req, schedule_instance, true); + err = __hci_req_schedule_adv_instance(&req, schedule_instance, true); if (!err) err = hci_req_run(&req, add_advertising_complete); @@ -7325,7 +6147,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) { + if (list_empty(&hdev->adv_instances)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); goto unlock; @@ -7333,10 +6155,10 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - clear_adv_instance(hdev, &req, cp->instance, true); + hci_req_clear_adv_instance(hdev, &req, cp->instance, true); if (list_empty(&hdev->adv_instances)) - disable_advertising(&req); + __hci_req_disable_advertising(&req); /* If no HCI commands have been collected so far or the HCI_ADVERTISING * flag is set or the device isn't powered then we have no HCI @@ -7369,6 +6191,62 @@ unlock: return err; } +static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) +{ + u8 max_len = HCI_MAX_AD_LENGTH; + + if (is_adv_data) { + if (adv_flags & (MGMT_ADV_FLAG_DISCOV | + MGMT_ADV_FLAG_LIMITED_DISCOV | + MGMT_ADV_FLAG_MANAGED_FLAGS)) + max_len -= 3; + + if (adv_flags & MGMT_ADV_FLAG_TX_POWER) + max_len -= 3; + } + + return max_len; +} + +static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_get_adv_size_info *cp = data; + struct mgmt_rp_get_adv_size_info rp; + u32 flags, supported_flags; + int err; + + BT_DBG("%s", hdev->name); + + if (!lmp_le_capable(hdev)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_STATUS_REJECTED); + + if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_STATUS_INVALID_PARAMS); + + flags = __le32_to_cpu(cp->flags); + + /* The current implementation only supports a subset of the specified + * flags. + */ + supported_flags = get_supported_adv_flags(hdev); + if (flags & ~supported_flags) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_STATUS_INVALID_PARAMS); + + rp.instance = cp->instance; + rp.flags = cp->flags; + rp.max_adv_data_len = tlv_data_max_len(flags, true); + rp.max_scan_rsp_len = tlv_data_max_len(flags, false); + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, + MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); + + return err; +} + static const struct hci_mgmt_handler mgmt_handlers[] = { { NULL }, /* 0x0000 (no command) */ { read_version, MGMT_READ_VERSION_SIZE, @@ -7456,6 +6334,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { add_advertising, MGMT_ADD_ADVERTISING_SIZE, HCI_MGMT_VAR_LEN }, { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, + { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, + { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -7526,9 +6406,8 @@ void mgmt_index_removed(struct hci_dev *hdev) } /* This function requires the caller holds hdev->lock */ -static void restart_le_actions(struct hci_request *req) +static void restart_le_actions(struct hci_dev *hdev) { - struct hci_dev *hdev = req->hdev; struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { @@ -7549,141 +6428,35 @@ static void restart_le_actions(struct hci_request *req) break; } } - - __hci_update_background_scan(req); } -static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode) +void mgmt_power_on(struct hci_dev *hdev, int err) { struct cmd_lookup match = { NULL, hdev }; - BT_DBG("status 0x%02x", status); - - if (!status) { - /* Register the available SMP channels (BR/EDR and LE) only - * when successfully powering on the controller. This late - * registration is required so that LE SMP can clearly - * decide if the public address or static address is used. - */ - smp_register(hdev); - } + BT_DBG("err %d", err); hci_dev_lock(hdev); + if (!err) { + restart_le_actions(hdev); + hci_update_background_scan(hdev); + } + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); new_settings(hdev, match.sk); - hci_dev_unlock(hdev); - if (match.sk) sock_put(match.sk); -} - -static int powered_update_hci(struct hci_dev *hdev) -{ - struct hci_request req; - struct adv_info *adv_instance; - u8 link_sec; - - hci_req_init(&req, hdev); - - if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) && - !lmp_host_ssp_capable(hdev)) { - u8 mode = 0x01; - - hci_req_add(&req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode); - - if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) { - u8 support = 0x01; - - hci_req_add(&req, HCI_OP_WRITE_SC_SUPPORT, - sizeof(support), &support); - } - } - - if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && - lmp_bredr_capable(hdev)) { - struct hci_cp_write_le_host_supported cp; - - cp.le = 0x01; - cp.simul = 0x00; - - /* Check first if we already have the right - * host state (host features set) - */ - if (cp.le != lmp_host_le_capable(hdev) || - cp.simul != lmp_host_le_br_capable(hdev)) - hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, - sizeof(cp), &cp); - } - - if (lmp_le_capable(hdev)) { - /* Make sure the controller has a good default for - * advertising data. This also applies to the case - * where BR/EDR was toggled during the AUTO_OFF phase. - */ - if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) && - (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))) { - update_adv_data(&req); - update_scan_rsp_data(&req); - } - - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - hdev->cur_adv_instance == 0x00 && - !list_empty(&hdev->adv_instances)) { - adv_instance = list_first_entry(&hdev->adv_instances, - struct adv_info, list); - hdev->cur_adv_instance = adv_instance->instance; - } - - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - enable_advertising(&req); - else if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) && - hdev->cur_adv_instance) - schedule_adv_instance(&req, hdev->cur_adv_instance, - true); - restart_le_actions(&req); - } - - link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY); - if (link_sec != test_bit(HCI_AUTH, &hdev->flags)) - hci_req_add(&req, HCI_OP_WRITE_AUTH_ENABLE, - sizeof(link_sec), &link_sec); - - if (lmp_bredr_capable(hdev)) { - if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) - write_fast_connectable(&req, true); - else - write_fast_connectable(&req, false); - __hci_update_page_scan(&req); - update_class(&req); - update_name(&req); - update_eir(&req); - } - - return hci_req_run(&req, powered_complete); + hci_dev_unlock(hdev); } -int mgmt_powered(struct hci_dev *hdev, u8 powered) +void __mgmt_power_off(struct hci_dev *hdev) { struct cmd_lookup match = { NULL, hdev }; u8 status, zero_cod[] = { 0, 0, 0 }; - int err; - - if (!hci_dev_test_flag(hdev, HCI_MGMT)) - return 0; - - if (powered) { - if (powered_update_hci(hdev) == 0) - return 0; - - mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, - &match); - goto new_settings; - } mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); @@ -7705,13 +6478,10 @@ int mgmt_powered(struct hci_dev *hdev, u8 powered) mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, zero_cod, sizeof(zero_cod), NULL); -new_settings: - err = new_settings(hdev, match.sk); + new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); - - return err; } void mgmt_set_powered_failed(struct hci_dev *hdev, int err) @@ -7733,43 +6503,6 @@ void mgmt_set_powered_failed(struct hci_dev *hdev, int err) mgmt_pending_remove(cmd); } -void mgmt_discoverable_timeout(struct hci_dev *hdev) -{ - struct hci_request req; - - hci_dev_lock(hdev); - - /* When discoverable timeout triggers, then just make sure - * the limited discoverable flag is cleared. Even in the case - * of a timeout triggered from general discoverable, it is - * safe to unconditionally clear the flag. - */ - hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); - hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); - - hci_req_init(&req, hdev); - if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { - u8 scan = SCAN_PAGE; - hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, - sizeof(scan), &scan); - } - update_class(&req); - - /* Advertising instances don't use the global discoverable setting, so - * only update AD if advertising was enabled using Set Advertising. - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - update_adv_data(&req); - - hci_req_run(&req, NULL); - - hdev->discov_timeout = 0; - - new_settings(hdev, NULL); - - hci_dev_unlock(hdev); -} - void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent) { @@ -8312,7 +7045,7 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status) if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) hci_req_add(&req, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(enable), &enable); - update_eir(&req); + __hci_req_update_eir(&req); } else { clear_eir(&req); } @@ -8452,7 +7185,7 @@ static void restart_le_scan(struct hci_dev *hdev) hdev->discovery.scan_duration)) return; - queue_delayed_work(hdev->workqueue, &hdev->le_scan_restart, + queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_restart, DISCOV_LE_RESTART_DELAY); } @@ -8527,6 +7260,18 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, return; } + if (hdev->discovery.limited) { + /* Check for limited discoverable bit */ + if (dev_class) { + if (!(dev_class[1] & 0x20)) + return; + } else { + u8 *flags = eir_get_data(eir, eir_len, EIR_FLAGS, NULL); + if (!flags || !(flags[0] & LE_AD_LIMITED)) + return; + } + } + /* Make sure that the buffer is big enough. The 5 extra bytes * are for the potential CoD field. */ @@ -8556,7 +7301,8 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, /* Copy EIR or advertising data into event */ memcpy(ev->eir, eir, eir_len); - if (dev_class && !eir_has_data_type(ev->eir, eir_len, EIR_CLASS_OF_DEV)) + if (dev_class && !eir_get_data(ev->eir, eir_len, EIR_CLASS_OF_DEV, + NULL)) eir_len = eir_append_data(ev->eir, eir_len, EIR_CLASS_OF_DEV, dev_class, 3); @@ -8606,35 +7352,6 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering) mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL); } -static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) -{ - BT_DBG("%s status %u", hdev->name, status); -} - -void mgmt_reenable_advertising(struct hci_dev *hdev) -{ - struct hci_request req; - u8 instance; - - if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && - !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) - return; - - instance = get_current_adv_instance(hdev); - - hci_req_init(&req, hdev); - - if (instance) { - schedule_adv_instance(&req, instance, true); - } else { - update_adv_data(&req); - update_scan_rsp_data(&req); - enable_advertising(&req); - } - - hci_req_run(&req, adv_enable_complete); -} - static struct hci_mgmt_chan chan = { .channel = HCI_CHANNEL_CONTROL, .handler_count = ARRAY_SIZE(mgmt_handlers), diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 29709fbfd1f5..f7eb02f09b54 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -692,11 +692,9 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s) static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) { - struct rfcomm_session *s; - struct list_head *p, *n; + struct rfcomm_session *s, *n; struct l2cap_chan *chan; - list_for_each_safe(p, n, &session_list) { - s = list_entry(p, struct rfcomm_session, list); + list_for_each_entry_safe(s, n, &session_list, list) { chan = l2cap_pi(s->sock->sk)->chan; if ((!bacmp(src, BDADDR_ANY) || !bacmp(&chan->src, src)) && @@ -709,16 +707,14 @@ static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) static struct rfcomm_session *rfcomm_session_close(struct rfcomm_session *s, int err) { - struct rfcomm_dlc *d; - struct list_head *p, *n; + struct rfcomm_dlc *d, *n; s->state = BT_CLOSED; BT_DBG("session %p state %ld err %d", s, s->state, err); /* Close all dlcs */ - list_for_each_safe(p, n, &s->dlcs) { - d = list_entry(p, struct rfcomm_dlc, list); + list_for_each_entry_safe(d, n, &s->dlcs, list) { d->state = BT_CLOSED; __rfcomm_dlc_close(d, err); } @@ -1771,13 +1767,11 @@ static struct rfcomm_session *rfcomm_recv_frame(struct rfcomm_session *s, static void rfcomm_process_connect(struct rfcomm_session *s) { - struct rfcomm_dlc *d; - struct list_head *p, *n; + struct rfcomm_dlc *d, *n; BT_DBG("session %p state %ld", s, s->state); - list_for_each_safe(p, n, &s->dlcs) { - d = list_entry(p, struct rfcomm_dlc, list); + list_for_each_entry_safe(d, n, &s->dlcs, list) { if (d->state == BT_CONFIG) { d->mtu = s->mtu; if (rfcomm_check_security(d)) { @@ -1843,14 +1837,11 @@ static int rfcomm_process_tx(struct rfcomm_dlc *d) static void rfcomm_process_dlcs(struct rfcomm_session *s) { - struct rfcomm_dlc *d; - struct list_head *p, *n; + struct rfcomm_dlc *d, *n; BT_DBG("session %p state %ld", s, s->state); - list_for_each_safe(p, n, &s->dlcs) { - d = list_entry(p, struct rfcomm_dlc, list); - + list_for_each_entry_safe(d, n, &s->dlcs, list) { if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) { __rfcomm_dlc_close(d, ETIMEDOUT); continue; @@ -1985,14 +1976,11 @@ static struct rfcomm_session *rfcomm_check_connection(struct rfcomm_session *s) static void rfcomm_process_sessions(void) { - struct list_head *p, *n; + struct rfcomm_session *s, *n; rfcomm_lock(); - list_for_each_safe(p, n, &session_list) { - struct rfcomm_session *s; - s = list_entry(p, struct rfcomm_session, list); - + list_for_each_entry_safe(s, n, &session_list, list) { if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) { s->state = BT_DISCONN; rfcomm_send_disc(s, 0); @@ -2075,15 +2063,12 @@ failed: static void rfcomm_kill_listener(void) { - struct rfcomm_session *s; - struct list_head *p, *n; + struct rfcomm_session *s, *n; BT_DBG(""); - list_for_each_safe(p, n, &session_list) { - s = list_entry(p, struct rfcomm_session, list); + list_for_each_entry_safe(s, n, &session_list, list) rfcomm_session_del(s); - } } static int rfcomm_run(void *unused) @@ -2113,8 +2098,7 @@ static int rfcomm_run(void *unused) static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; - struct rfcomm_dlc *d; - struct list_head *p, *n; + struct rfcomm_dlc *d, *n; BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt); @@ -2122,9 +2106,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) if (!s) return; - list_for_each_safe(p, n, &s->dlcs) { - d = list_entry(p, struct rfcomm_dlc, list); - + list_for_each_entry_safe(d, n, &s->dlcs, list) { if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) { rfcomm_dlc_clear_timer(d); if (status || encrypt == 0x00) { diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index fe129663bd3f..f52bcbf2e58c 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -526,6 +526,9 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, if (!addr || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + if (addr_len < sizeof(struct sockaddr_sco)) + return -EINVAL; + lock_sock(sk); if (sk->sk_state != BT_OPEN) { diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index c91353841e40..ffed8a1d4f27 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3027,8 +3027,13 @@ static void smp_ready_cb(struct l2cap_chan *chan) BT_DBG("chan %p", chan); + /* No need to call l2cap_chan_hold() here since we already own + * the reference taken in smp_new_conn_cb(). This is just the + * first time that we tie it to a specific pointer. The code in + * l2cap_core.c ensures that there's no risk this function wont + * get called if smp_new_conn_cb was previously called. + */ conn->smp = chan; - l2cap_chan_hold(chan); if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) bredr_pairing(chan); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 5e88d3e17546..2c8095a5d824 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -28,6 +28,8 @@ const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); +static struct lock_class_key bridge_netdev_addr_lock_key; + /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -87,6 +89,11 @@ out: return NETDEV_TX_OK; } +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev) err = br_vlan_init(br); if (err) free_percpu(br->stats); + br_set_lockdep_class(dev); return err; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index a642bb829d09..82e3e9705017 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -135,6 +135,7 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) { struct switchdev_obj_port_fdb fdb = { .obj = { + .orig_dev = f->dst->dev, .id = SWITCHDEV_OBJ_ID_PORT_FDB, .flags = SWITCHDEV_F_DEFER, }, diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ec02f5869a78..c367b3e1b5ac 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -493,7 +493,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); if (err) goto err5; @@ -511,8 +511,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); - if (nbp_vlan_init(p)) + err = nbp_vlan_init(p); + if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); + goto err6; + } spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); @@ -533,6 +536,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; +err6: + list_del_rcu(&p->list); + br_fdb_delete_by_port(br, p, 0, 1); + nbp_update_port_count(br); + netdev_upper_dev_unlink(dev, br->dev); + err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index cd8deea2d074..30e105f57f0d 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -7,6 +7,7 @@ #include <linux/if_ether.h> #include <net/ip.h> #include <net/netlink.h> +#include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/addrconf.h> @@ -210,10 +211,32 @@ static inline size_t rtnl_mdb_nlmsg_size(void) static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, int type) { + struct switchdev_obj_port_mdb mdb = { + .obj = { + .id = SWITCHDEV_OBJ_ID_PORT_MDB, + .flags = SWITCHDEV_F_DEFER, + }, + .vid = entry->vid, + }; + struct net_device *port_dev; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; + port_dev = __dev_get_by_index(net, entry->ifindex); + if (entry->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(entry->addr.u.ip4, mdb.addr); +#if IS_ENABLED(CONFIG_IPV6) + else + ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr); +#endif + + mdb.obj.orig_dev = port_dev; + if (port_dev && type == RTM_NEWMDB) + switchdev_port_obj_add(port_dev, &mdb.obj); + else if (port_dev && type == RTM_DELMDB) + switchdev_port_obj_del(port_dev, &mdb.obj); + skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index f7e8dee64fc8..b3cca126b103 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -40,6 +40,7 @@ void br_log_state(const struct net_bridge_port *p) void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { + .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, .flags = SWITCHDEV_F_DEFER, .u.stp_state = state, @@ -48,7 +49,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) p->state = state; err = switchdev_port_attr_set(p->dev, &attr); - if (err) + if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); } @@ -570,6 +571,7 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) { struct switchdev_attr attr = { + .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.ageing_time = ageing_time, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index fa53d7a89f48..a31ac6ad76a2 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -37,9 +37,10 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) void br_init_port(struct net_bridge_port *p) { struct switchdev_attr attr = { + .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER, - .u.ageing_time = p->br->ageing_time, + .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time), }; int err; @@ -50,7 +51,7 @@ void br_init_port(struct net_bridge_port *p) p->config_pending = 0; err = switchdev_port_attr_set(p->dev, &attr); - if (err) + if (err && err != -EOPNOTSUPP) netdev_err(p->dev, "failed to set HW ageing time\n"); } @@ -142,7 +143,10 @@ static void br_stp_start(struct net_bridge *br) char *envp[] = { NULL }; struct net_bridge_port *p; - r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + if (net_eq(dev_net(br->dev), &init_net)) + r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + else + r = -ENOENT; spin_lock_bh(&br->lock); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 8365bd53c421..6b8091407ca3 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -22,7 +22,6 @@ #include "br_private.h" -#define to_dev(obj) container_of(obj, struct device, kobj) #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* @@ -814,7 +813,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { - struct device *dev = to_dev(kobj); + struct device *dev = kobj_to_dev(kobj); struct net_bridge *br = to_bridge(dev); int n; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 1394da63614a..85e43af4af7a 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -73,6 +73,7 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, u16 vid, u16 flags) { struct switchdev_obj_port_vlan v = { + .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, .vid_begin = vid, @@ -120,6 +121,7 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, u16 vid) { struct switchdev_obj_port_vlan v = { + .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid_begin = vid, .vid_end = vid, @@ -624,9 +626,21 @@ void br_recalculate_fwd_mask(struct net_bridge *br) int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { + struct switchdev_attr attr = { + .orig_dev = br->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.vlan_filtering = val, + }; + int err; + if (br->vlan_enabled == val) return 0; + err = switchdev_port_attr_set(br->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + br->vlan_enabled = val; br_manage_promisc(br); recalculate_group_addr(br); @@ -637,13 +651,15 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { + int err; + if (!rtnl_trylock()) return restart_syscall(); - __br_vlan_filter_toggle(br, val); + err = __br_vlan_filter_toggle(br, val); rtnl_unlock(); - return 0; + return err; } int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) @@ -891,6 +907,12 @@ err_rhtbl: int nbp_vlan_init(struct net_bridge_port *p) { + struct switchdev_attr attr = { + .orig_dev = p->br->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, + .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, + .u.vlan_filtering = p->br->vlan_enabled, + }; struct net_bridge_vlan_group *vg; int ret = -ENOMEM; @@ -898,6 +920,10 @@ int nbp_vlan_init(struct net_bridge_port *p) if (!vg) goto out; + ret = switchdev_port_attr_set(p->dev, &attr); + if (ret && ret != -EOPNOTSUPP) + goto err_vlan_enabled; + ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; @@ -917,6 +943,7 @@ err_vlan_add: RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); rhashtable_destroy(&vg->vlan_hash); +err_vlan_enabled: err_rhtbl: kfree(vg); diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 17fd5f2cb4b8..98de6e7fd86d 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -65,8 +65,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) return false; - if (!(info->bitmask & ( EBT_IP6_DPORT | - EBT_IP6_SPORT | EBT_IP6_ICMP6))) + if (!(info->bitmask & (EBT_IP6_DPORT | + EBT_IP6_SPORT | EBT_IP6_ICMP6))) return true; /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */ diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 0ad639a96142..152300d164ac 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -36,14 +36,12 @@ static int ebt_log_tg_check(const struct xt_tgchk_param *par) return 0; } -struct tcpudphdr -{ +struct tcpudphdr { __be16 src; __be16 dst; }; -struct arppayload -{ +struct arppayload { unsigned char mac_src[ETH_ALEN]; unsigned char ip_src[4]; unsigned char mac_dst[ETH_ALEN]; @@ -152,7 +150,8 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, ntohs(ah->ar_op)); /* If it's for Ethernet and the lengths are OK, - * then log the ARP payload */ + * then log the ARP payload + */ if (ah->ar_hrd == htons(1) && ah->ar_hln == ETH_ALEN && ah->ar_pln == sizeof(__be32)) { diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 0c40570069ba..6b731e12ecfa 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -41,7 +41,7 @@ struct stp_config_pdu { #define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]) static bool ebt_filter_config(const struct ebt_stp_info *info, - const struct stp_config_pdu *stpc) + const struct stp_config_pdu *stpc) { const struct ebt_stp_config_info *c; uint16_t v16; diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index 618568888128..98c221dbf059 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -66,7 +66,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) * - Canonical Format Indicator (CFI). The Canonical Format Indicator * (CFI) is a single bit flag value. Currently ignored. * - VLAN Identifier (VID). The VID is encoded as - * an unsigned binary number. */ + * an unsigned binary number. + */ id = TCI & VLAN_VID_MASK; prio = (TCI >> 13) & 0x7; @@ -98,7 +99,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) } /* Check for bitmask range - * True if even one bit is out of mask */ + * True if even one bit is out of mask + */ if (info->bitmask & ~EBT_VLAN_MASK) { pr_debug("bitmask %2X is out of mask (%2X)\n", info->bitmask, EBT_VLAN_MASK); @@ -117,7 +119,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) * 0 - The null VLAN ID. * 1 - The default Port VID (PVID) * 0x0FFF - Reserved for implementation use. - * if_vlan.h: VLAN_N_VID 4096. */ + * if_vlan.h: VLAN_N_VID 4096. + */ if (GET_BITMASK(EBT_VLAN_ID)) { if (!!info->id) { /* if id!=0 => check vid range */ if (info->id > VLAN_N_VID) { @@ -128,7 +131,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) /* Note: This is valid VLAN-tagged frame point. * Any value of user_priority are acceptable, * but should be ignored according to 802.1Q Std. - * So we just drop the prio flag. */ + * So we just drop the prio flag. + */ info->bitmask &= ~EBT_VLAN_PRIO; } /* Else, id=0 (null VLAN ID) => user_priority range (any?) */ @@ -143,7 +147,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) } /* Check for encapsulated proto range - it is possible to be * any value for u_short range. - * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS */ + * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS + */ if (GET_BITMASK(EBT_VLAN_ENCAP)) { if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) { pr_debug("encap frame length %d is less than " diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 32eccd101f26..593a1bdc079e 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -12,7 +12,7 @@ #include <linux/module.h> #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ - (1 << NF_BR_LOCAL_OUT)) + (1 << NF_BR_LOCAL_OUT)) static struct ebt_entries initial_chains[] = { { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index ec55358f00c8..eb33919821ee 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -12,7 +12,7 @@ #include <linux/module.h> #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ - (1 << NF_BR_POST_ROUTING)) + (1 << NF_BR_POST_ROUTING)) static struct ebt_entries initial_chains[] = { { diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f46ca417bf2d..67b2e27999aa 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -35,8 +35,7 @@ "report to author: "format, ## args) /* #define BUGPRINT(format, args...) */ -/* - * Each cpu has its own set of counters, so there is no need for write_lock in +/* Each cpu has its own set of counters, so there is no need for write_lock in * the softirq * For reading or updating the counters, the user context needs to * get a write_lock @@ -46,7 +45,7 @@ #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) #define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter))) #define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \ - COUNTER_OFFSET(n) * cpu)) + COUNTER_OFFSET(n) * cpu)) @@ -126,7 +125,7 @@ ebt_dev_check(const char *entry, const struct net_device *device) /* process standard matches */ static inline int ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out) + const struct net_device *in, const struct net_device *out) { const struct ethhdr *h = eth_hdr(skb); const struct net_bridge_port *p; @@ -162,7 +161,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, for (i = 0; i < 6; i++) verdict |= (h->h_source[i] ^ e->sourcemac[i]) & e->sourcemsk[i]; - if (FWINV2(verdict != 0, EBT_ISOURCE) ) + if (FWINV2(verdict != 0, EBT_ISOURCE)) return 1; } if (e->bitmask & EBT_DESTMAC) { @@ -170,7 +169,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, for (i = 0; i < 6; i++) verdict |= (h->h_dest[i] ^ e->destmac[i]) & e->destmsk[i]; - if (FWINV2(verdict != 0, EBT_IDEST) ) + if (FWINV2(verdict != 0, EBT_IDEST)) return 1; } return 0; @@ -237,7 +236,8 @@ unsigned int ebt_do_table(struct sk_buff *skb, (*(counter_base + i)).bcnt += skb->len; /* these should only watch: not modify, nor tell us - what to do with the packet */ + * what to do with the packet + */ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); t = (struct ebt_entry_target *) @@ -323,7 +323,7 @@ letscontinue: /* If it succeeds, returns element and locks mutex */ static inline void * find_inlist_lock_noload(struct list_head *head, const char *name, int *error, - struct mutex *mutex) + struct mutex *mutex) { struct { struct list_head list; @@ -342,7 +342,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error, static void * find_inlist_lock(struct list_head *head, const char *name, const char *prefix, - int *error, struct mutex *mutex) + int *error, struct mutex *mutex) { return try_then_request_module( find_inlist_lock_noload(head, name, error, mutex), @@ -451,7 +451,8 @@ static int ebt_verify_pointers(const struct ebt_replace *repl, if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) { if (e->bitmask != 0) { /* we make userspace set this right, - so there is no misunderstanding */ + * so there is no misunderstanding + */ BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set " "in distinguisher\n"); return -EINVAL; @@ -487,15 +488,14 @@ static int ebt_verify_pointers(const struct ebt_replace *repl, return 0; } -/* - * this one is very careful, as it is the first function +/* this one is very careful, as it is the first function * to parse the userspace data */ static inline int ebt_check_entry_size_and_hooks(const struct ebt_entry *e, - const struct ebt_table_info *newinfo, - unsigned int *n, unsigned int *cnt, - unsigned int *totalcnt, unsigned int *udc_cnt) + const struct ebt_table_info *newinfo, + unsigned int *n, unsigned int *cnt, + unsigned int *totalcnt, unsigned int *udc_cnt) { int i; @@ -504,10 +504,12 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e, break; } /* beginning of a new chain - if i == NF_BR_NUMHOOKS it must be a user defined chain */ + * if i == NF_BR_NUMHOOKS it must be a user defined chain + */ if (i != NF_BR_NUMHOOKS || !e->bitmask) { /* this checks if the previous chain has as many entries - as it said it has */ + * as it said it has + */ if (*n != *cnt) { BUGPRINT("nentries does not equal the nr of entries " "in the chain\n"); @@ -549,20 +551,18 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e, return 0; } -struct ebt_cl_stack -{ +struct ebt_cl_stack { struct ebt_chainstack cs; int from; unsigned int hookmask; }; -/* - * we need these positions to check that the jumps to a different part of the +/* We need these positions to check that the jumps to a different part of the * entries is a jump to the beginning of a new chain. */ static inline int ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, - unsigned int *n, struct ebt_cl_stack *udc) + unsigned int *n, struct ebt_cl_stack *udc) { int i; @@ -649,9 +649,9 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) static inline int ebt_check_entry(struct ebt_entry *e, struct net *net, - const struct ebt_table_info *newinfo, - const char *name, unsigned int *cnt, - struct ebt_cl_stack *cl_s, unsigned int udc_cnt) + const struct ebt_table_info *newinfo, + const char *name, unsigned int *cnt, + struct ebt_cl_stack *cl_s, unsigned int udc_cnt) { struct ebt_entry_target *t; struct xt_target *target; @@ -673,7 +673,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, BUGPRINT("Unknown flag for inv bitmask\n"); return -EINVAL; } - if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) { + if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3)) { BUGPRINT("NOPROTO & 802_3 not allowed\n"); return -EINVAL; } @@ -687,7 +687,8 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, break; } /* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on - a base chain */ + * a base chain + */ if (i < NF_BR_NUMHOOKS) hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); else { @@ -758,13 +759,12 @@ cleanup_matches: return ret; } -/* - * checks for loops and sets the hook mask for udc +/* checks for loops and sets the hook mask for udc * the hook mask for udc tells us from which base chains the udc can be * accessed. This mask is a parameter to the check() functions of the extensions */ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s, - unsigned int udc_cnt, unsigned int hooknr, char *base) + unsigned int udc_cnt, unsigned int hooknr, char *base) { int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict; const struct ebt_entry *e = (struct ebt_entry *)chain->data; @@ -853,7 +853,8 @@ static int translate_table(struct net *net, const char *name, return -EINVAL; } /* make sure chains are ordered after each other in same order - as their corresponding hooks */ + * as their corresponding hooks + */ for (j = i + 1; j < NF_BR_NUMHOOKS; j++) { if (!newinfo->hook_entry[j]) continue; @@ -868,7 +869,8 @@ static int translate_table(struct net *net, const char *name, i = 0; /* holds the expected nr. of entries for the chain */ j = 0; /* holds the up to now counted entries for the chain */ k = 0; /* holds the total nr. of entries, should equal - newinfo->nentries afterwards */ + * newinfo->nentries afterwards + */ udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */ ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, ebt_check_entry_size_and_hooks, newinfo, @@ -888,10 +890,12 @@ static int translate_table(struct net *net, const char *name, } /* get the location of the udc, put them in an array - while we're at it, allocate the chainstack */ + * while we're at it, allocate the chainstack + */ if (udc_cnt) { /* this will get free'd in do_replace()/ebt_register_table() - if an error occurs */ + * if an error occurs + */ newinfo->chainstack = vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack))); if (!newinfo->chainstack) @@ -932,14 +936,15 @@ static int translate_table(struct net *net, const char *name, } /* we now know the following (along with E=mc²): - - the nr of entries in each chain is right - - the size of the allocated space is right - - all valid hooks have a corresponding chain - - there are no loops - - wrong data can still be on the level of a single entry - - could be there are jumps to places that are not the - beginning of a chain. This can only occur in chains that - are not accessible from any base chains, so we don't care. */ + * - the nr of entries in each chain is right + * - the size of the allocated space is right + * - all valid hooks have a corresponding chain + * - there are no loops + * - wrong data can still be on the level of a single entry + * - could be there are jumps to places that are not the + * beginning of a chain. This can only occur in chains that + * are not accessible from any base chains, so we don't care. + */ /* used to know what we need to clean up if something goes wrong */ i = 0; @@ -955,7 +960,7 @@ static int translate_table(struct net *net, const char *name, /* called under write_lock */ static void get_counters(const struct ebt_counter *oldcounters, - struct ebt_counter *counters, unsigned int nentries) + struct ebt_counter *counters, unsigned int nentries) { int i, cpu; struct ebt_counter *counter_base; @@ -986,7 +991,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, struct ebt_table *t; /* the user wants counters back - the check on the size is done later, when we have the lock */ + * the check on the size is done later, when we have the lock + */ if (repl->num_counters) { unsigned long size = repl->num_counters * sizeof(*counterstmp); counterstmp = vmalloc(size); @@ -1038,9 +1044,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, write_unlock_bh(&t->lock); mutex_unlock(&ebt_mutex); /* so, a user can change the chains while having messed up her counter - allocation. Only reason why this is done is because this way the lock - is held only once, while this doesn't bring the kernel into a - dangerous state. */ + * allocation. Only reason why this is done is because this way the lock + * is held only once, while this doesn't bring the kernel into a + * dangerous state. + */ if (repl->num_counters && copy_to_user(repl->counters, counterstmp, repl->num_counters * sizeof(struct ebt_counter))) { @@ -1342,13 +1349,14 @@ static int update_counters(struct net *net, const void __user *user, } static inline int ebt_make_matchname(const struct ebt_entry_match *m, - const char *base, char __user *ubase) + const char *base, char __user *ubase) { char __user *hlp = ubase + ((char *)m - base); char name[EBT_FUNCTION_MAXNAMELEN] = {}; /* ebtables expects 32 bytes long names but xt_match names are 29 bytes - long. Copy 29 bytes and fill remaining bytes with zeroes. */ + * long. Copy 29 bytes and fill remaining bytes with zeroes. + */ strlcpy(name, m->u.match->name, sizeof(name)); if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; @@ -1356,19 +1364,19 @@ static inline int ebt_make_matchname(const struct ebt_entry_match *m, } static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, - const char *base, char __user *ubase) + const char *base, char __user *ubase) { char __user *hlp = ubase + ((char *)w - base); char name[EBT_FUNCTION_MAXNAMELEN] = {}; strlcpy(name, w->u.watcher->name, sizeof(name)); - if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN)) + if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } -static inline int -ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) +static inline int ebt_make_names(struct ebt_entry *e, const char *base, + char __user *ubase) { int ret; char __user *hlp; @@ -1394,9 +1402,9 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) } static int copy_counters_to_user(struct ebt_table *t, - const struct ebt_counter *oldcounters, - void __user *user, unsigned int num_counters, - unsigned int nentries) + const struct ebt_counter *oldcounters, + void __user *user, unsigned int num_counters, + unsigned int nentries) { struct ebt_counter *counterstmp; int ret = 0; @@ -1427,7 +1435,7 @@ static int copy_counters_to_user(struct ebt_table *t, /* called with ebt_mutex locked */ static int copy_everything_to_user(struct ebt_table *t, void __user *user, - const int *len, int cmd) + const int *len, int cmd) { struct ebt_replace tmp; const struct ebt_counter *oldcounters; @@ -1595,8 +1603,7 @@ static int ebt_compat_entry_padsize(void) static int ebt_compat_match_offset(const struct xt_match *match, unsigned int userlen) { - /* - * ebt_among needs special handling. The kernel .matchsize is + /* ebt_among needs special handling. The kernel .matchsize is * set to -1 at registration time; at runtime an EBT_ALIGN()ed * value is expected. * Example: userspace sends 4500, ebt_among.c wants 4504. @@ -1966,8 +1973,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, return off + match_size; } -/* - * return size of all matches, watchers or target, including necessary +/* return size of all matches, watchers or target, including necessary * alignment and padding. */ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, @@ -2070,8 +2076,7 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, if (ret < 0) return ret; buf_start = (char *) entry; - /* - * 0: matches offset, always follows ebt_entry. + /* 0: matches offset, always follows ebt_entry. * 1: watchers offset, from ebt_entry structure * 2: target offset, from ebt_entry structure * 3: next ebt_entry offset, from ebt_entry structure @@ -2115,8 +2120,7 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, return 0; } -/* - * repl->entries_size is the size of the ebt_entry blob in userspace. +/* repl->entries_size is the size of the ebt_entry blob in userspace. * It might need more memory when copied to a 64 bit kernel in case * userspace is 32-bit. So, first task: find out how much memory is needed. * @@ -2305,7 +2309,7 @@ static int compat_do_ebt_set_ctl(struct sock *sk, break; default: ret = -EINVAL; - } + } return ret; } @@ -2360,8 +2364,7 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd, break; case EBT_SO_GET_ENTRIES: case EBT_SO_GET_INIT_ENTRIES: - /* - * try real handler first in case of userland-side padding. + /* try real handler first in case of userland-side padding. * in case we are dealing with an 'ordinary' 32 bit binary * without 64bit compatibility padding, this will fail right * after copy_from_user when the *len argument is validated. diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 62f6b1b19589..7fcdd7261d88 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -141,7 +141,7 @@ err: static void nf_tables_bridge_exit_net(struct net *net) { - nft_unregister_afinfo(net->nft.bridge); + nft_unregister_afinfo(net, net->nft.bridge); kfree(net->nft.bridge); } diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index a21269b83f16..4b901d9f2e7c 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -84,6 +84,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_set_eval, .init = nft_meta_set_init, + .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, }; diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index cc858919108e..aa209b1066c9 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -323,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); @@ -331,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo) if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 10d87753ed87..9e43a315e662 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *ticket_buf = NULL; void *tp, *tpend; void **ptp; - struct ceph_timespec new_validity; struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; @@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); + ceph_decode_timespec(&validity, dp); + dp += sizeof(struct ceph_timespec); new_expires = get_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); dout(" expires=%lu renew_after=%lu\n", new_expires, @@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_buffer_put(th->ticket_blob); th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; - th->validity = new_validity; th->secret_id = new_secret_id; th->expires = new_expires; th->renew_after = new_renew_after; + th->have_key = true; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); @@ -384,6 +383,24 @@ bad: return -ERANGE; } +static bool need_key(struct ceph_x_ticket_handler *th) +{ + if (!th->have_key) + return true; + + return get_seconds() >= th->renew_after; +} + +static bool have_key(struct ceph_x_ticket_handler *th) +{ + if (th->have_key) { + if (get_seconds() >= th->expires) + th->have_key = false; + } + + return th->have_key; +} + static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) { int want = ac->want_keys; @@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) continue; th = get_ticket_handler(ac, service); - if (IS_ERR(th)) { *pneed |= service; continue; } - if (get_seconds() >= th->renew_after) + if (need_key(th)) *pneed |= service; - if (get_seconds() >= th->expires) + if (!have_key(th)) xi->have_keys &= ~service; } } - static int ceph_x_build_request(struct ceph_auth_client *ac, void *buf, void *end) { @@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) ac->private = NULL; } -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) +static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) { struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - memset(&th->validity, 0, sizeof(th->validity)); + th->have_key = false; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + /* + * We are to invalidate a service ticket in the hopes of + * getting a new, hopefully more valid, one. But, we won't get + * it unless our AUTH ticket is good, so invalidate AUTH ticket + * as well, just in case. + */ + invalidate_ticket(ac, peer_type); + invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); } static int calcu_signature(struct ceph_x_authorizer *au, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e8b7c6917d47..40b1a3cf7397 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -16,7 +16,7 @@ struct ceph_x_ticket_handler { unsigned int service; struct ceph_crypto_key session_key; - struct ceph_timespec validity; + bool have_key; u64 secret_id; struct ceph_buffer *ticket_blob; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9981039ef4ff..9cfedf565f5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -23,9 +23,6 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> -#define list_entry_next(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con) } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); - cursor->page = list_entry_next(cursor->page, lru); + cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; @@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_entry_next(cursor->data, links); + cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) @@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same @@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? */ con->out_msg->footer.data_crc = 0; @@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -2506,13 +2528,13 @@ more: more_kvec: /* kvec data queued? */ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con) static void con_fault_finish(struct ceph_connection *con) { + dout("%s %p\n", __func__, con); + /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); + if (con->auth_retry) { + dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->ops->invalidate_authorizer) + con->ops->invalidate_authorizer(con); + con->auth_retry = 0; } if (con->ops->fault) @@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg) ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m) static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - LIST_HEAD(data); - struct list_head *links; - struct list_head *next; + struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_splice_init(&m->data, &data); - list_for_each_safe(links, next, &data) { - struct ceph_msg_data *data; - - data = list_entry(links, struct ceph_msg_data, links); - list_del_init(links); + list_for_each_entry_safe(data, next, &m->data, links) { + list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index edda01626a45..de85dddc3dc0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc) return monc->client->have_fsid && monc->auth->global_id > 0; } -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. - */ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { diff --git a/net/core/Makefile b/net/core/Makefile index 086b01fbe1bd..0b835de04de3 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o tso.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 617088aee21d..fa9dc6450b08 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -83,8 +83,8 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn /* * Wait for the last received packet to be different from skb */ -static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, - const struct sk_buff *skb) +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); @@ -130,6 +130,7 @@ out_noerr: error = 1; goto out; } +EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { @@ -161,13 +162,15 @@ done: } /** - * __skb_recv_datagram - Receive a datagram skbuff + * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned + * @last: set to last peeked message to inform the wait function + * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and @@ -175,9 +178,11 @@ done: * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * - * This function will lock the socket if a skb is returned, so the caller - * needs to unlock the socket in that case (usually by calling - * skb_free_datagram) + * This function will lock the socket if a skb is returned, so + * the caller needs to unlock the socket in that case (usually by + * calling skb_free_datagram). Returns NULL with *err set to + * -EAGAIN if no data was available or to some other value if an + * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve @@ -191,13 +196,13 @@ done: * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ -struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, - int *peeked, int *off, int *err) +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err, + struct sk_buff **last) { struct sk_buff_head *queue = &sk->sk_receive_queue; - struct sk_buff *skb, *last; + struct sk_buff *skb; unsigned long cpu_flags; - long timeo; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ @@ -206,8 +211,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -217,10 +220,10 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, */ int _off = *off; - last = (struct sk_buff *)queue; + *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { - last = skb; + *last = skb; *peeked = skb->peeked; if (flags & MSG_PEEK) { if (_off >= skb->len && (skb->len || _off || @@ -231,8 +234,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, skb = skb_set_peeked(skb); error = PTR_ERR(skb); - if (IS_ERR(skb)) - goto unlock_err; + if (IS_ERR(skb)) { + spin_unlock_irqrestore(&queue->lock, + cpu_flags); + goto no_packet; + } atomic_inc(&skb->users); } else @@ -242,25 +248,38 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, *off = _off; return skb; } + spin_unlock_irqrestore(&queue->lock, cpu_flags); + } while (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)); - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)) - continue; + error = -EAGAIN; - /* User doesn't want to wait */ - error = -EAGAIN; - if (!timeo) - goto no_packet; +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_try_recv_datagram); - } while (!wait_for_more_packets(sk, err, &timeo, last)); +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, + int *peeked, int *off, int *err) +{ + struct sk_buff *skb, *last; + long timeo; - return NULL; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, + &last); + if (skb) + return skb; + + if (*err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, err, &timeo, last)); -unlock_err: - spin_unlock_irqrestore(&queue->lock, cpu_flags); -no_packet: - *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); @@ -785,7 +804,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/core/dev.c b/net/core/dev.c index ab9b8d0d115e..cc9e3652cf93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -96,6 +96,7 @@ #include <linux/skbuff.h> #include <net/net_namespace.h> #include <net/sock.h> +#include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dst.h> @@ -137,6 +138,7 @@ #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> +#include <linux/sctp.h> #include "net-sysfs.h" @@ -182,8 +184,8 @@ EXPORT_SYMBOL(dev_base_lock); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; -static DEFINE_HASHTABLE(napi_hash, 8); +static unsigned int napi_gen_id = NR_CPUS; +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; @@ -1674,6 +1676,22 @@ void net_dec_ingress_queue(void) EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif +#ifdef CONFIG_NET_EGRESS +static struct static_key egress_needed __read_mostly; + +void net_inc_egress_queue(void) +{ + static_key_slow_inc(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_inc_egress_queue); + +void net_dec_egress_queue(void) +{ + static_key_slow_dec(&egress_needed); +} +EXPORT_SYMBOL_GPL(net_dec_egress_queue); +#endif + static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL /* We are not allowed to call static_key_slow_dec() from irq context @@ -2403,17 +2421,20 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, + name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); @@ -2467,6 +2488,141 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +/* skb_csum_offload_check - Driver helper function to determine if a device + * with limited checksum offload capabilities is able to offload the checksum + * for a given packet. + * + * Arguments: + * skb - sk_buff for the packet in question + * spec - contains the description of what device can offload + * csum_encapped - returns true if the checksum being offloaded is + * encpasulated. That is it is checksum for the transport header + * in the inner headers. + * checksum_help - when set indicates that helper function should + * call skb_checksum_help if offload checks fail + * + * Returns: + * true: Packet has passed the checksum checks and should be offloadable to + * the device (a driver may still need to check for additional + * restrictions of its device) + * false: Checksum is not offloadable. If checksum_help was set then + * skb_checksum_help was called to resolve checksum for non-GSO + * packets and when IP protocol is not SCTP + */ +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + struct iphdr *iph; + struct ipv6hdr *ipv6; + void *nhdr; + int protocol; + u8 ip_proto; + + if (skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD)) { + if (!spec->vlan_okay) + goto need_help; + } + + /* We check whether the checksum refers to a transport layer checksum in + * the outermost header or an encapsulated transport layer checksum that + * corresponds to the inner headers of the skb. If the checksum is for + * something else in the packet we need help. + */ + if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { + /* Non-encapsulated checksum */ + protocol = eproto_to_ipproto(vlan_get_protocol(skb)); + nhdr = skb_network_header(skb); + *csum_encapped = false; + if (spec->no_not_encapped) + goto need_help; + } else if (skb->encapsulation && spec->encap_okay && + skb_checksum_start_offset(skb) == + skb_inner_transport_offset(skb)) { + /* Encapsulated checksum */ + *csum_encapped = true; + switch (skb->inner_protocol_type) { + case ENCAP_TYPE_ETHER: + protocol = eproto_to_ipproto(skb->inner_protocol); + break; + case ENCAP_TYPE_IPPROTO: + protocol = skb->inner_protocol; + break; + } + nhdr = skb_inner_network_header(skb); + } else { + goto need_help; + } + + switch (protocol) { + case IPPROTO_IP: + if (!spec->ipv4_okay) + goto need_help; + iph = nhdr; + ip_proto = iph->protocol; + if (iph->ihl != 5 && !spec->ip_options_okay) + goto need_help; + break; + case IPPROTO_IPV6: + if (!spec->ipv6_okay) + goto need_help; + if (spec->no_encapped_ipv6 && *csum_encapped) + goto need_help; + ipv6 = nhdr; + nhdr += sizeof(*ipv6); + ip_proto = ipv6->nexthdr; + break; + default: + goto need_help; + } + +ip_proto_again: + switch (ip_proto) { + case IPPROTO_TCP: + if (!spec->tcp_okay || + skb->csum_offset != offsetof(struct tcphdr, check)) + goto need_help; + break; + case IPPROTO_UDP: + if (!spec->udp_okay || + skb->csum_offset != offsetof(struct udphdr, check)) + goto need_help; + break; + case IPPROTO_SCTP: + if (!spec->sctp_okay || + skb->csum_offset != offsetof(struct sctphdr, checksum)) + goto cant_help; + break; + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 *opthdr = nhdr; + + if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) + goto need_help; + + ip_proto = opthdr[0]; + nhdr += (opthdr[1] + 1) << 3; + + goto ip_proto_again; + } + default: + goto need_help; + } + + /* Passed the tests for offloading checksum */ + return true; + +need_help: + if (csum_help && !skb_shinfo(skb)->gso_size) + skb_checksum_help(skb); +cant_help: + return false; +} +EXPORT_SYMBOL(__skb_csum_offload_chk); + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -2539,6 +2695,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. + * + * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -2553,6 +2711,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); + SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; @@ -2641,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_CSUM_MASK; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -2788,7 +2949,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_ALL_CSUM) && + if (!(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) goto out_kfree_skb; } @@ -2867,7 +3028,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, bool contended; int rc; - qdisc_pkt_len_init(skb); qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a @@ -2925,7 +3085,8 @@ static void skb_update_prio(struct sk_buff *skb) struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); if (!skb->priority && skb->sk && map) { - unsigned int prioidx = skb->sk->sk_cgrp_prioidx; + unsigned int prioidx = + sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; @@ -2959,6 +3120,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +#ifdef CONFIG_NET_EGRESS +static struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); + struct tcf_result cl_res; + + if (!cl) + return skb; + + /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set + * earlier by the caller. + */ + qdisc_bstats_cpu_update(cl->q, skb); + + switch (tc_classify(skb, cl, &cl_res, false)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_cpu_drop(cl->q); + *ret = NET_XMIT_DROP; + goto drop; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *ret = NET_XMIT_SUCCESS; +drop: + kfree_skb(skb); + return NULL; + case TC_ACT_REDIRECT: + /* No need to push/pop skb's mac_header here on egress! */ + skb_do_redirect(skb); + *ret = NET_XMIT_SUCCESS; + return NULL; + default: + break; + } + + return skb; +} +#endif /* CONFIG_NET_EGRESS */ + static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -3018,7 +3222,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, int queue_index = 0; #ifdef CONFIG_XPS - if (skb->sender_cpu == 0) + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif @@ -3083,6 +3289,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_update_prio(skb); + qdisc_pkt_len_init(skb); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); +# ifdef CONFIG_NET_EGRESS + if (static_key_false(&egress_needed)) { + skb = sch_handle_egress(skb, &rc, dev); + if (!skb) + goto out; + } +# endif +#endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ @@ -3104,9 +3321,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); -#endif trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); @@ -3663,9 +3877,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) +static inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); @@ -3859,7 +4073,7 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { - skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; @@ -4350,6 +4564,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); @@ -4383,7 +4598,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - napi->skb = skb; + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } } return skb; } @@ -4658,7 +4876,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -4669,43 +4887,101 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } -EXPORT_SYMBOL_GPL(napi_by_id); -void napi_hash_add(struct napi_struct *napi) +#if defined(CONFIG_NET_RX_BUSY_POLL) +#define BUSY_POLL_BUDGET 8 +bool sk_busy_loop(struct sock *sk, int nonblock) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + int (*busy_poll)(struct napi_struct *dev); + struct napi_struct *napi; + int rc = false; - spin_lock(&napi_hash_lock); + rcu_read_lock(); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + /* Note: ndo_busy_poll method is optional in linux-4.5 */ + busy_poll = napi->dev->netdev_ops->ndo_busy_poll; + + do { + rc = 0; + local_bh_disable(); + if (busy_poll) { + rc = busy_poll(napi); + } else if (napi_schedule_prep(napi)) { + void *have = netpoll_poll_lock(napi); + + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi); + if (rc == BUSY_POLL_BUDGET) { + napi_complete_done(napi, rc); + napi_schedule(napi); + } + } + netpoll_poll_unlock(have); } + if (rc > 0) + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + local_bh_enable(); - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ - spin_unlock(&napi_hash_lock); - } + cpu_relax(); + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + !need_resched() && !busy_loop_timeout(end_time)); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock(); + return rc; +} +EXPORT_SYMBOL(sk_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +void napi_hash_add(struct napi_struct *napi) +{ + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || + test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + return; + + spin_lock(&napi_hash_lock); + + /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < NR_CPUS + 1)) + napi_gen_id = NR_CPUS + 1; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); } EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -void napi_hash_del(struct napi_struct *napi) +bool napi_hash_del(struct napi_struct *napi) { + bool rcu_sync_needed = false; + spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { + rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); - + } spin_unlock(&napi_hash_lock); + return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); @@ -4741,6 +5017,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); @@ -4760,8 +5037,12 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +/* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { + might_sleep(); + if (napi_hash_del(napi)) + synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); @@ -5348,7 +5629,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *private) + void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; @@ -5372,6 +5653,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, changeupper_info.upper_dev = upper_dev; changeupper_info.master = master; changeupper_info.linking = true; + changeupper_info.upper_info = upper_info; ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, &changeupper_info.info); @@ -5379,7 +5661,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; @@ -5417,8 +5699,12 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, - &changeupper_info.info); + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); + ret = notifier_to_errno(ret); + if (ret) + goto rollback_lower_mesh; + return 0; rollback_lower_mesh: @@ -5472,7 +5758,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -5480,6 +5766,8 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device + * @upper_priv: upper device private + * @upper_info: upper info to be passed down via notifier * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -5488,20 +5776,14 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + void *upper_priv, void *upper_info) { - return __netdev_upper_dev_link(dev, upper_dev, true, NULL); + return __netdev_upper_dev_link(dev, upper_dev, true, + upper_priv, upper_info); } EXPORT_SYMBOL(netdev_master_upper_dev_link); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private) -{ - return __netdev_upper_dev_link(dev, upper_dev, true, private); -} -EXPORT_SYMBOL(netdev_master_upper_dev_link_private); - /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -5660,7 +5942,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)) + bool (*type_check)(const struct net_device *dev)) { struct net_device *lower = NULL; struct list_head *iter; @@ -5682,6 +5964,26 @@ int dev_get_nest_level(struct net_device *dev, } EXPORT_SYMBOL(dev_get_nest_level); +/** + * netdev_lower_change - Dispatch event about lower device state change + * @lower_dev: device + * @lower_state_info: state to dispatch + * + * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info) +{ + struct netdev_notifier_changelowerstate_info changelowerstate_info; + + ASSERT_RTNL(); + changelowerstate_info.lower_state_info = lower_state_info; + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + &changelowerstate_info.info); +} +EXPORT_SYMBOL(netdev_lower_state_changed); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6372,9 +6674,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, /* UFO needs SG and checksumming */ if (features & NETIF_F_UFO) { /* maybe split UFO into V4 and V6? */ - if (!((features & NETIF_F_GEN_CSUM) || - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + if (!(features & NETIF_F_HW_CSUM) && + ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { netdev_dbg(dev, "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; @@ -6426,11 +6728,16 @@ int __netdev_update_features(struct net_device *dev) if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ return -1; } @@ -7156,11 +7463,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs); * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. * If this is the last reference then it will be freed. + * Must be called in process context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + might_sleep(); netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); @@ -7469,16 +7778,16 @@ static int dev_cpu_callback(struct notifier_block *nfb, netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { - if (mask & NETIF_F_GEN_CSUM) - mask |= NETIF_F_ALL_CSUM; + if (mask & NETIF_F_HW_CSUM) + mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; - all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ - if (all & NETIF_F_GEN_CSUM) - all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + if (all & NETIF_F_HW_CSUM) + all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } diff --git a/net/core/dst.c b/net/core/dst.c index e6dc77252fe9..a1656e3b8d72 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -301,12 +301,13 @@ void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; + unsigned short nocache = dst->flags & DST_NOCACHE; newrefcnt = atomic_dec_return(&dst->__refcnt); if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (!newrefcnt && unlikely(dst->flags & DST_NOCACHE)) + if (!newrefcnt && unlikely(nocache)) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 29edf74846fc..daf04709dd3c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -87,7 +87,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", - [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", + [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", [NETIF_F_RXHASH_BIT] = "rx-hashing", @@ -191,6 +191,23 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) return ret; } +static int phy_get_sset_count(struct phy_device *phydev) +{ + int ret; + + if (phydev->drv->get_sset_count && + phydev->drv->get_strings && + phydev->drv->get_stats) { + mutex_lock(&phydev->lock); + ret = phydev->drv->get_sset_count(phydev); + mutex_unlock(&phydev->lock); + + return ret; + } + + return -EOPNOTSUPP; +} + static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -204,6 +221,13 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); + if (sset == ETH_SS_PHY_STATS) { + if (dev->phydev) + return phy_get_sset_count(dev->phydev); + else + return -EOPNOTSUPP; + } + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -223,7 +247,17 @@ static void __ethtool_get_strings(struct net_device *dev, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); - else + else if (stringset == ETH_SS_PHY_STATS) { + struct phy_device *phydev = dev->phydev; + + if (phydev) { + mutex_lock(&phydev->lock); + phydev->drv->get_strings(phydev, data); + mutex_unlock(&phydev->lock); + } else { + return; + } + } else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } @@ -235,7 +269,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: - return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; @@ -1401,6 +1435,47 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + struct phy_device *phydev = dev->phydev; + u64 *data; + int ret, n_stats; + + if (!phydev) + return -EOPNOTSUPP; + + n_stats = phy_get_sset_count(phydev); + + if (n_stats < 0) + return n_stats; + WARN_ON(n_stats == 0); + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + mutex_lock(&phydev->lock); + phydev->drv->get_stats(phydev, &stats, data); + mutex_unlock(&phydev->lock); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; @@ -1779,6 +1854,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: + case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: @@ -1991,6 +2067,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; + case ETHTOOL_GPHYSTATS: + rc = ethtool_get_phy_stats(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/filter.c b/net/core/filter.c index 672eefbfbe99..94d26201080d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -50,6 +50,7 @@ #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> +#include <net/sock_reuseport.h> /** * sk_filter - run a packet through a socket filter @@ -348,12 +349,6 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * jump offsets, 2nd pass remapping: * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); - * - * User BPF's register A is mapped to our BPF register 6, user BPF - * register X is mapped to BPF register 7; frame pointer is always - * register 10; Context 'void *ctx' is stored in register 1, that is, - * for socket filters: ctx == 'struct sk_buff *', for seccomp: - * ctx == 'struct seccomp_data *'. */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len) @@ -381,9 +376,22 @@ do_pass: new_insn = new_prog; fp = prog; - if (new_insn) - *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); - new_insn++; + /* Classic BPF related prologue emission. */ + if (new_insn) { + /* Classic BPF expects A and X to be reset first. These need + * to be guaranteed to be the first two instructions. + */ + *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + + /* All programs must keep CTX in callee saved BPF_REG_CTX. + * In eBPF case it's done by the compiler, here we need to + * do this ourself. Initial CTX is present in BPF_REG_ARG1. + */ + *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + } else { + new_insn += 3; + } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[6] = { }; @@ -777,6 +785,11 @@ static int bpf_check_classic(const struct sock_filter *filter, if (ftest->k == 0) return -EINVAL; break; + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_K: + if (ftest->k >= 32) + return -EINVAL; + break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: @@ -1160,17 +1173,32 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -/** - * sk_attach_filter - attach a socket filter - * @fprog: the filter program - * @sk: the socket to use - * - * Attach the user's filter code. We first run some sanity checks on - * it to make sure it does not explode on us later. If an error - * occurs or there is insufficient memory for the filter a negative - * errno code is returned. On success the return is zero. - */ -int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) +{ + struct bpf_prog *old_prog; + int err; + + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + return -ENOMEM; + + if (sk_unhashed(sk)) { + err = reuseport_alloc(sk); + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + + old_prog = reuseport_attach_prog(sk, prog); + if (old_prog) + bpf_prog_destroy(old_prog); + + return 0; +} + +static +struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); unsigned int bpf_fsize = bpf_prog_size(fprog->len); @@ -1178,19 +1206,19 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; + return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) - return -EINVAL; + return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); - return -EFAULT; + return ERR_PTR(-EFAULT); } prog->len = fprog->len; @@ -1198,13 +1226,30 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog, NULL); + return bpf_prepare_filter(prog, NULL); +} + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct bpf_prog *prog = __get_filter(fprog, sk); + int err; + if (IS_ERR(prog)) return PTR_ERR(prog); @@ -1218,23 +1263,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); -int sk_attach_bpf(u32 ufd, struct sock *sk) +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { - struct bpf_prog *prog; + struct bpf_prog *prog = __get_filter(fprog, sk); int err; + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + __bpf_prog_release(prog); + return err; + } + + return 0; +} + +static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) - return -EPERM; + return ERR_PTR(-EPERM); prog = bpf_prog_get(ufd); if (IS_ERR(prog)) - return PTR_ERR(prog); + return prog; if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); - return -EINVAL; + return ERR_PTR(-EINVAL); } + return prog; +} + +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); @@ -1244,7 +1316,24 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) +{ + struct bpf_prog *prog = __get_bpf(ufd, sk); + int err; + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __reuseport_attach_prog(prog, sk); + if (err < 0) { + bpf_prog_put(prog); + return err; + } + + return 0; +} + +#define BPF_LDST_LEN 16U static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { @@ -1252,9 +1341,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) int offset = (int) r2; void *from = (void *) (long) r3; unsigned int len = (unsigned int) r4; - char buf[16]; + char buf[BPF_LDST_LEN]; void *ptr; + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + return -EINVAL; + /* bpf verifier guarantees that: * 'from' pointer points to bpf program stack * 'len' bytes of it were initialized @@ -1274,7 +1366,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely(!ptr)) return -EFAULT; - if (BPF_RECOMPUTE_CSUM(flags)) + if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpull_rcsum(skb, ptr, len); memcpy(ptr, from, len); @@ -1283,8 +1375,9 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) /* skb_store_bits cannot return -EFAULT here */ skb_store_bits(skb, offset, ptr, len); - if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + if (flags & BPF_F_RECOMPUTE_CSUM) + skb_postpush_rcsum(skb, ptr, len); + return 0; } @@ -1299,8 +1392,35 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) -#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) +static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; + int offset = (int) r2; + void *to = (void *)(unsigned long) r3; + unsigned int len = (unsigned int) r4; + void *ptr; + + if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, to); + if (unlikely(!ptr)) + return -EFAULT; + if (ptr != to) + memcpy(to, ptr, len); + + return 0; +} + +const struct bpf_func_proto bpf_skb_load_bytes_proto = { + .func = bpf_skb_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { @@ -1308,6 +1428,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) int offset = (int) r2; __sum16 sum, *ptr; + if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) + return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1319,7 +1441,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely(!ptr)) return -EFAULT; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { case 2: csum_replace2(ptr, from, to); break; @@ -1351,10 +1473,12 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); + bool is_pseudo = flags & BPF_F_PSEUDO_HDR; int offset = (int) r2; __sum16 sum, *ptr; + if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; @@ -1366,7 +1490,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely(!ptr)) return -EFAULT; - switch (BPF_HEADER_FIELD_SIZE(flags)) { + switch (flags & BPF_F_HDR_FIELD_MASK) { case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; @@ -1395,13 +1519,14 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) - static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; struct net_device *dev; + if (unlikely(flags & ~(BPF_F_INGRESS))) + return -EINVAL; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -1410,8 +1535,12 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - if (BPF_IS_REDIRECT_INGRESS(flags)) + if (flags & BPF_F_INGRESS) { + if (skb_at_tc_ingress(skb2)) + skb_postpush_rcsum(skb2, skb_mac_header(skb2), + skb2->mac_len); return dev_forward_skb(dev, skb2); + } skb2->dev = dev; skb_sender_cpu_clear(skb2); @@ -1433,12 +1562,17 @@ struct redirect_info { }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); + static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + if (unlikely(flags & ~(BPF_F_INGRESS))) + return TC_ACT_SHOT; + ri->ifindex = ifindex; ri->flags = flags; + return TC_ACT_REDIRECT; } @@ -1454,8 +1588,12 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - if (BPF_IS_REDIRECT_INGRESS(ri->flags)) + if (ri->flags & BPF_F_INGRESS) { + if (skb_at_tc_ingress(skb)) + skb_postpush_rcsum(skb, skb_mac_header(skb), + skb->mac_len); return dev_forward_skb(dev, skb); + } skb->dev = dev; skb_sender_cpu_clear(skb); @@ -1547,19 +1685,49 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static unsigned short bpf_tunnel_key_af(u64 flags) +{ + return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; +} + static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + u8 compat[sizeof(struct bpf_tunnel_key)]; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) - return -EINVAL; - if (ip_tunnel_info_af(info) != AF_INET) + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) return -EINVAL; + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) + return -EPROTO; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + to = (struct bpf_tunnel_key *)compat; + break; + default: + return -EINVAL; + } + } to->tunnel_id = be64_to_cpu(info->key.tun_id); - to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + to->tunnel_tos = info->key.tos; + to->tunnel_ttl = info->key.ttl; + + if (flags & BPF_F_TUNINFO_IPV6) + memcpy(to->remote_ipv6, &info->key.u.ipv6.src, + sizeof(to->remote_ipv6)); + else + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + + if (unlikely(size != sizeof(struct bpf_tunnel_key))) + memcpy((void *)(long) r2, to, size); return 0; } @@ -1581,10 +1749,25 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); + u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) return -EINVAL; + if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + switch (size) { + case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): + /* Fixup deprecated structure layouts here, so we have + * a common path later on. + */ + memcpy(compat, from, size); + memset(compat + size, 0, sizeof(compat) - size); + from = (struct bpf_tunnel_key *)compat; + break; + default: + return -EINVAL; + } + } skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1592,9 +1775,19 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_flags = TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); - info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.tos = from->tunnel_tos; + info->key.ttl = from->tunnel_ttl; + + if (flags & BPF_F_TUNINFO_IPV6) { + info->mode |= IP_TUNNEL_INFO_IPV6; + memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, + sizeof(from->remote_ipv6)); + } else { + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + } return 0; } @@ -1654,6 +1847,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1aa8437ed6c4..f18ae91b652e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -857,7 +857,7 @@ static void neigh_probe(struct neighbour *neigh) struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) - skb = skb_copy(skb, GFP_ATOMIC); + skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); @@ -2215,7 +2215,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, ndm->ndm_pad2 = 0; ndm->ndm_flags = pn->flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; - ndm->ndm_ifindex = pn->dev->ifindex; + ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) @@ -2333,7 +2333,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) + if (pneigh_net(n) != net) continue; if (idx < s_idx) goto next; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f88a62ab019d..b6c8a6629b39 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -471,6 +471,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct switchdev_attr attr = { + .orig_dev = netdev, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -1452,8 +1453,8 @@ static void netdev_release(struct device *d) static const void *net_namespace(struct device *d) { - struct net_device *dev; - dev = container_of(d, struct net_device, dev); + struct net_device *dev = to_net_dev(d); + return dev_net(dev); } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index adef015b2f41..92da5e4ceb4f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -32,6 +32,10 @@ #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/fib.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <trace/events/fib6.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47b1a8f..0260c84ed83c 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -56,29 +56,41 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css_cls_state(css)); } -static int update_classid(const void *v, struct file *file, unsigned n) +static int update_classid_sock(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_classid = (u32)(unsigned long)v; - + if (sock) { + spin_lock(&cgroup_sk_update_lock); + sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, + (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void update_classid(struct cgroup_subsys_state *css, void *v) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; + struct css_task_iter it; struct task_struct *p; - cgroup_taskset_for_each(p, tset) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { task_lock(p); - iterate_fd(p->files, 0, update_classid, v); + iterate_fd(p->files, 0, update_classid_sock, v); task_unlock(p); } + css_task_iter_end(&it); +} + +static void cgrp_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + + cgroup_taskset_first(tset, &css); + update_classid(css, + (void *)(unsigned long)css_cls_state(css)->classid); } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) @@ -89,8 +101,13 @@ static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { - css_cls_state(css)->classid = (u32) value; + struct cgroup_cls_state *cs = css_cls_state(css); + + cgroup_sk_alloc_disable(); + + cs->classid = (u32)value; + update_classid(css, (void *)(unsigned long)cs->classid); return 0; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a199bf52..f1efbc39ef6b 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,6 +27,12 @@ #include <linux/fdtable.h> +/* + * netprio allocates per-net_device priomap array which is indexed by + * css->id. Limiting css ID to 16bits doesn't lose anything. + */ +#define NETPRIO_ID_MAX USHRT_MAX + #define PRIOMAP_MIN_SZ 128 /* @@ -144,6 +150,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css) struct net_device *dev; int ret = 0; + if (css->id > NETPRIO_ID_MAX) + return -ENOSPC; + if (!parent_css) return 0; @@ -200,6 +209,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; + cgroup_sk_alloc_disable(); + rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -213,18 +224,23 @@ static int update_netprio(const void *v, struct file *file, unsigned n) { int err; struct socket *sock = sock_from_file(file, &err); - if (sock) - sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + if (sock) { + spin_lock(&cgroup_sk_update_lock); + sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, + (unsigned long)v); + spin_unlock(&cgroup_sk_update_lock); + } return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index de8d5cc5eb24..1474cfd2dc1c 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2787,7 +2787,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, } else { skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + if (likely(skb)) + skb_reserve(skb, LL_RESERVED_SPACE(dev)); return skb; } @@ -2898,7 +2900,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V4_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; udp4_hwcsum(skb, iph->saddr, iph->daddr); @@ -3032,7 +3034,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, if (!(pkt_dev->flags & F_UDPCSUM)) { skb->ip_summed = CHECKSUM_NONE; - } else if (odev->features & NETIF_F_V6_CSUM) { + } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 504bd17b7456..d735e854f916 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1027,6 +1027,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct switchdev_attr attr = { + .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; @@ -1045,15 +1046,156 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, + struct net_device *dev) +{ + const struct rtnl_link_stats64 *stats; + struct rtnl_link_stats64 temp; + struct nlattr *attr; + + stats = dev_get_stats(dev, &temp); + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats(nla_data(attr), stats); + + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (!attr) + return -EMSGSIZE; + + copy_rtnl_link_stats64(nla_data(attr), stats); + + return 0; +} + +static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, + struct net_device *dev, + int vfs_num, + struct nlattr *vfinfo) +{ + struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_stats vf_stats; + struct ifla_vf_trust vf_trust; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_rate vf_rate; + struct nlattr *vf, *vfstats; + struct ifla_vf_mac vf_mac; + struct ifla_vf_info ivi; + + /* Not all SR-IOV capable drivers support the + * spoofcheck and "RSS query enable" query. Preset to + * -1 so the user space tool can detect that the driver + * didn't report anything. + */ + ivi.spoofchk = -1; + ivi.rss_query_en = -1; + ivi.trusted = -1; + memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; + if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) + return 0; + + vf_mac.vf = + vf_vlan.vf = + vf_rate.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = + vf_linkstate.vf = + vf_rss_query_en.vf = + vf_trust.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.max_tx_rate; + vf_rate.min_tx_rate = ivi.min_tx_rate; + vf_rate.max_tx_rate = ivi.max_tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; + vf_rss_query_en.setting = ivi.rss_query_en; + vf_trust.setting = ivi.trusted; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || + nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || + nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), + &vf_rate) || + nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate) || + nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate) || + nla_put(skb, IFLA_VF_RSS_QUERY_EN, + sizeof(vf_rss_query_en), + &vf_rss_query_en) || + nla_put(skb, IFLA_VF_TRUST, + sizeof(vf_trust), &vf_trust)) + return -EMSGSIZE; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + return -EMSGSIZE; + nla_nest_end(skb, vfstats); + nla_nest_end(skb, vf); + return 0; +} + +static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) +{ + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct rtnl_link_stats64 temp; - const struct rtnl_link_stats64 *stats; - struct nlattr *attr, *af_spec; + struct nlattr *af_spec; struct rtnl_af_ops *af_ops; struct net_device *upper_dev = netdev_master_upper_dev_get(dev); @@ -1096,18 +1238,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; - if (1) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) - goto nla_put_failure; - } + if (rtnl_fill_link_ifmap(skb, dev)) + goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || @@ -1124,128 +1256,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); - if (attr == NULL) - goto nla_put_failure; - - stats = dev_get_stats(dev, &temp); - copy_rtnl_link_stats(nla_data(attr), stats); - - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); - if (attr == NULL) + if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - copy_rtnl_link_stats64(nla_data(attr), stats); if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent - && (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && + ext_filter_mask & RTEXT_FILTER_VF) { int i; - - struct nlattr *vfinfo, *vf, *vfstats; + struct nlattr *vfinfo; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); if (!vfinfo) goto nla_put_failure; for (i = 0; i < num_vfs; i++) { - struct ifla_vf_info ivi; - struct ifla_vf_mac vf_mac; - struct ifla_vf_vlan vf_vlan; - struct ifla_vf_rate vf_rate; - struct ifla_vf_tx_rate vf_tx_rate; - struct ifla_vf_spoofchk vf_spoofchk; - struct ifla_vf_link_state vf_linkstate; - struct ifla_vf_rss_query_en vf_rss_query_en; - struct ifla_vf_stats vf_stats; - struct ifla_vf_trust vf_trust; - - /* - * Not all SR-IOV capable drivers support the - * spoofcheck and "RSS query enable" query. Preset to - * -1 so the user space tool can detect that the driver - * didn't report anything. - */ - ivi.spoofchk = -1; - ivi.rss_query_en = -1; - ivi.trusted = -1; - memset(ivi.mac, 0, sizeof(ivi.mac)); - /* The default value for VF link state is "auto" - * IFLA_VF_LINK_STATE_AUTO which equals zero - */ - ivi.linkstate = 0; - if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) - break; - vf_mac.vf = - vf_vlan.vf = - vf_rate.vf = - vf_tx_rate.vf = - vf_spoofchk.vf = - vf_linkstate.vf = - vf_rss_query_en.vf = - vf_trust.vf = ivi.vf; - - memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); - vf_vlan.vlan = ivi.vlan; - vf_vlan.qos = ivi.qos; - vf_tx_rate.rate = ivi.max_tx_rate; - vf_rate.min_tx_rate = ivi.min_tx_rate; - vf_rate.max_tx_rate = ivi.max_tx_rate; - vf_spoofchk.setting = ivi.spoofchk; - vf_linkstate.link_state = ivi.linkstate; - vf_rss_query_en.setting = ivi.rss_query_en; - vf_trust.setting = ivi.trusted; - vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || - nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || - nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), - &vf_rate) || - nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), - &vf_tx_rate) || - nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk) || - nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), - &vf_linkstate) || - nla_put(skb, IFLA_VF_RSS_QUERY_EN, - sizeof(vf_rss_query_en), - &vf_rss_query_en) || - nla_put(skb, IFLA_VF_TRUST, - sizeof(vf_trust), &vf_trust)) + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) goto nla_put_failure; - memset(&vf_stats, 0, sizeof(vf_stats)); - if (dev->netdev_ops->ndo_get_vf_stats) - dev->netdev_ops->ndo_get_vf_stats(dev, i, - &vf_stats); - vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - goto nla_put_failure; - } - if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast) || - nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast)) - goto nla_put_failure; - nla_nest_end(skb, vfstats); - nla_nest_end(skb, vf); } + nla_nest_end(skb, vfinfo); } @@ -2533,7 +2564,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, - int nlflags) + int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; @@ -2549,7 +2580,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb, ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = NUD_PERMANENT; + ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; @@ -2570,7 +2601,8 @@ static inline size_t rtnl_fdb_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); } -static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type) +static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, + u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2581,7 +2613,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, - 0, 0, type, NTF_SELF, 0); + 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; @@ -2716,7 +2748,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) nlh->nlmsg_flags); if (!err) { - rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH); + rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2817,7 +2850,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { - rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH); + rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -2845,7 +2879,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, - NLM_F_MULTI); + NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: @@ -3317,7 +3351,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); rtnl_doit_func doit; - int sz_idx, kind; + int kind; int family; int type; int err; @@ -3333,7 +3367,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; - sz_idx = type>>2; kind = type&3; if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) diff --git a/net/core/scm.c b/net/core/scm.c index 3b6899b7d810..14596fb37172 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -289,8 +289,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) /* Bump the usage count and install the file. */ sock = sock_from_file(fp[i], &err); if (sock) { - sock_update_netprioidx(sock->sk); - sock_update_classid(sock->sk); + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(fp[i])); } @@ -305,6 +305,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) err = put_user(cmlen, &cm->cmsg_len); if (!err) { cmlen = CMSG_SPACE(i*sizeof(int)); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index aa41e6dd6429..b2df375ec9c2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3643,7 +3643,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; - if (sk->sk_protocol == IPPROTO_TCP) + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) serr->ee.ee_data -= sk->sk_tskey; } @@ -4268,7 +4269,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) return NULL; } - memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN, + 2 * ETH_ALEN); skb->mac_header += VLAN_HLEN; return skb; } diff --git a/net/core/sock.c b/net/core/sock.c index 1e4dd54bfb5a..6c1c8bc93412 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -134,6 +134,7 @@ #include <linux/sock_diag.h> #include <linux/filter.h> +#include <net/sock_reuseport.h> #include <trace/events/sock.h> @@ -194,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap) } EXPORT_SYMBOL(sk_net_capable); - -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } - - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; -} - -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ - struct proto *proto; - - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); -} -#endif - /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -239,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif - /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -433,8 +391,6 @@ static bool sock_needs_netstamp(const struct sock *sk) } } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) - static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { @@ -874,7 +830,8 @@ set_rcvbuf: if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) { if (sk->sk_state != TCP_ESTABLISHED) { ret = -EINVAL; break; @@ -933,6 +890,32 @@ set_rcvbuf: } break; + case SO_ATTACH_REUSEPORT_CBPF: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_reuseport_attach_filter(&fprog, sk); + } + break; + + case SO_ATTACH_REUSEPORT_EBPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_reuseport_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; @@ -1363,6 +1346,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); + cgroup_sk_alloc(&sk->sk_cgrp_data); } return sk; @@ -1385,6 +1369,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) owner = prot->owner; slab = prot->slab; + cgroup_sk_free(&sk->sk_cgrp_data); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); @@ -1393,17 +1378,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) -void sock_update_netprioidx(struct sock *sk) -{ - if (in_interrupt()) - return; - - sk->sk_cgrp_prioidx = task_netprioidx(current); -} -EXPORT_SYMBOL_GPL(sock_update_netprioidx); -#endif - /** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace @@ -1432,8 +1406,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_classid(&sk->sk_cgrp_data); + sock_update_netprioidx(&sk->sk_cgrp_data); } return sk; @@ -1453,6 +1427,8 @@ void sk_destruct(struct sock *sk) sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); } + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); @@ -1488,12 +1464,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); -} - /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1530,7 +1500,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1553,7 +1522,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ is_charged = sk_filter_charge(newsk, filter); - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; @@ -1589,7 +1558,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - sk_update_clone(sk, newsk); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); @@ -1607,7 +1577,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - __sk_dst_set(sk, dst); + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -1815,7 +1785,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); for (;;) { if (!timeo) break; @@ -1861,7 +1831,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) break; - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; if (!timeo) @@ -2048,9 +2018,9 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); return rc; } @@ -2071,27 +2041,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; - int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + allocated = sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + goto suppress_allocation; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -2140,6 +2110,9 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2155,6 +2128,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); @@ -2283,7 +2259,7 @@ static void sock_def_wakeup(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } @@ -2294,7 +2270,7 @@ static void sock_def_error_report(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); @@ -2306,7 +2282,7 @@ static void sock_def_readable(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -2324,7 +2300,7 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); @@ -2388,7 +2364,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 0c1d58d43f67..a996ce8c8fb2 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -214,7 +214,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; struct sock_diag_req *req = nlmsg_data(nlh); @@ -234,8 +234,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; - else + else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); + else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) + err = hndl->destroy(skb, nlh); + else + err = -EOPNOTSUPP; mutex_unlock(&sock_diag_table_mutex); return err; @@ -261,7 +265,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return ret; case SOCK_DIAG_BY_FAMILY: - return __sock_diag_rcv_msg(skb, nlh); + case SOCK_DESTROY: + return __sock_diag_cmd(skb, nlh); default: return -EINVAL; } @@ -295,6 +300,18 @@ static int sock_diag_bind(struct net *net, int group) return 0; } +int sock_diag_destroy(struct sock *sk, int err) +{ + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!sk->sk_prot->diag_destroy) + return -EOPNOTSUPP; + + return sk->sk_prot->diag_destroy(sk, err); +} +EXPORT_SYMBOL_GPL(sock_diag_destroy); + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c new file mode 100644 index 000000000000..1df98c557440 --- /dev/null +++ b/net/core/sock_reuseport.c @@ -0,0 +1,251 @@ +/* + * To speed up listener socket lookup, create an array to store all sockets + * listening on the same port. This allows a decision to be made after finding + * the first socket. An optional BPF program can also be configured for + * selecting the socket index from the array of available sockets. + */ + +#include <net/sock_reuseport.h> +#include <linux/bpf.h> +#include <linux/rcupdate.h> + +#define INIT_SOCKS 128 + +static DEFINE_SPINLOCK(reuseport_lock); + +static struct sock_reuseport *__reuseport_alloc(u16 max_socks) +{ + size_t size = sizeof(struct sock_reuseport) + + sizeof(struct sock *) * max_socks; + struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + + if (!reuse) + return NULL; + + reuse->max_socks = max_socks; + + RCU_INIT_POINTER(reuse->prog, NULL); + return reuse; +} + +int reuseport_alloc(struct sock *sk) +{ + struct sock_reuseport *reuse; + + /* bh lock used since this function call may precede hlist lock in + * soft irq of receive path or setsockopt from process context + */ + spin_lock_bh(&reuseport_lock); + WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + "multiple allocations for the same socket"); + reuse = __reuseport_alloc(INIT_SOCKS); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + + reuse->socks[0] = sk; + reuse->num_socks = 1; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_alloc); + +static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) +{ + struct sock_reuseport *more_reuse; + u32 more_socks_size, i; + + more_socks_size = reuse->max_socks * 2U; + if (more_socks_size > U16_MAX) + return NULL; + + more_reuse = __reuseport_alloc(more_socks_size); + if (!more_reuse) + return NULL; + + more_reuse->max_socks = more_socks_size; + more_reuse->num_socks = reuse->num_socks; + more_reuse->prog = reuse->prog; + + memcpy(more_reuse->socks, reuse->socks, + reuse->num_socks * sizeof(struct sock *)); + + for (i = 0; i < reuse->num_socks; ++i) + rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, + more_reuse); + + /* Note: we use kfree_rcu here instead of reuseport_free_rcu so + * that reuse and more_reuse can temporarily share a reference + * to prog. + */ + kfree_rcu(reuse, rcu); + return more_reuse; +} + +/** + * reuseport_add_sock - Add a socket to the reuseport group of another. + * @sk: New socket to add to the group. + * @sk2: Socket belonging to the existing reuseport group. + * May return ENOMEM and not add socket to group under memory pressure. + */ +int reuseport_add_sock(struct sock *sk, const struct sock *sk2) +{ + struct sock_reuseport *reuse; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)), + "socket already in reuseport group"); + + if (reuse->num_socks == reuse->max_socks) { + reuse = reuseport_grow(reuse); + if (!reuse) { + spin_unlock_bh(&reuseport_lock); + return -ENOMEM; + } + } + + reuse->socks[reuse->num_socks] = sk; + /* paired with smp_rmb() in reuseport_select_sock() */ + smp_wmb(); + reuse->num_socks++; + rcu_assign_pointer(sk->sk_reuseport_cb, reuse); + + spin_unlock_bh(&reuseport_lock); + + return 0; +} +EXPORT_SYMBOL(reuseport_add_sock); + +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + +void reuseport_detach_sock(struct sock *sk) +{ + struct sock_reuseport *reuse; + int i; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); + + for (i = 0; i < reuse->num_socks; i++) { + if (reuse->socks[i] == sk) { + reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; + reuse->num_socks--; + if (reuse->num_socks == 0) + call_rcu(&reuse->rcu, reuseport_free_rcu); + break; + } + } + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_detach_sock); + +static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) +{ + struct sk_buff *nskb = NULL; + u32 index; + + if (skb_shared(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + skb = nskb; + } + + /* temporarily advance data past protocol header */ + if (!pskb_pull(skb, hdr_len)) { + kfree_skb(nskb); + return NULL; + } + index = bpf_prog_run_save_cb(prog, skb); + __skb_push(skb, hdr_len); + + consume_skb(nskb); + + if (index >= socks) + return NULL; + + return reuse->socks[index]; +} + +/** + * reuseport_select_sock - Select a socket from an SO_REUSEPORT group. + * @sk: First socket in the group. + * @hash: When no BPF filter is available, use this hash to select. + * @skb: skb to run through BPF filter. + * @hdr_len: BPF filter expects skb data pointer at payload data. If + * the skb does not yet point at the payload, this parameter represents + * how far the pointer needs to advance to reach the payload. + * Returns a socket that should receive the packet (or NULL on error). + */ +struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len) +{ + struct sock_reuseport *reuse; + struct bpf_prog *prog; + struct sock *sk2 = NULL; + u16 socks; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + + /* if memory allocation failed or add call is not yet complete */ + if (!reuse) + goto out; + + prog = rcu_dereference(reuse->prog); + socks = READ_ONCE(reuse->num_socks); + if (likely(socks)) { + /* paired with smp_wmb() in reuseport_add_sock() */ + smp_rmb(); + + if (prog && skb) + sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + else + sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + } + +out: + rcu_read_unlock(); + return sk2; +} +EXPORT_SYMBOL(reuseport_select_sock); + +struct bpf_prog * +reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +{ + struct sock_reuseport *reuse; + struct bpf_prog *old_prog; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + old_prog = rcu_dereference_protected(reuse->prog, + lockdep_is_held(&reuseport_lock)); + rcu_assign_pointer(reuse->prog, prog); + spin_unlock_bh(&reuseport_lock); + + return old_prog; +} +EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/core/stream.c b/net/core/stream.c index d70f77a0c889..159516a11b7e 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -35,11 +35,11 @@ void sk_stream_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } @@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; while (1) { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -139,7 +139,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) } if (signal_pending(current)) goto do_interrupted; - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index db5fc2440a23..9c6d0508e63a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -202,7 +202,9 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req security_req_classify_flow(req, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -219,7 +221,10 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; - err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -387,6 +392,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; @@ -453,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * comment in that function for the gory details. -acme */ - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; @@ -488,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. */ - if (np->opt != NULL) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt != NULL) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); @@ -757,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -856,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -873,12 +883,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; - if (np->opt != NULL) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; diff --git a/net/dccp/output.c b/net/dccp/output.c index 4ce912e691d0..b66c84db0766 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -201,7 +201,7 @@ void dccp_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b5cf13a28009..41e65804ddf5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 675cf94e04f8..13d6b1a6e0fc 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; @@ -1747,9 +1750,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); } @@ -2004,10 +2007,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); sk_wait_event(sk, &timeo, !dn_queue_too_long(scp, queue, flags)); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); finish_wait(sk_sleep(sk), &wait); continue; } diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 4677b6fa6dda..ecc28cff08ab 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -67,7 +67,7 @@ * Returns the size of the result on success, -ve error code otherwise. */ int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry) + const char *options, char **_result, time64_t *_expiry) { struct key *rkey; const struct user_key_payload *upayload; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 1eba07feb34a..fa4daba8db55 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -21,8 +21,10 @@ #include <linux/of_mdio.h> #include <linux/of_platform.h> #include <linux/of_net.h> +#include <linux/of_gpio.h> #include <linux/sysfs.h> #include <linux/phy_fixed.h> +#include <linux/gpio/consumer.h> #include "dsa_priv.h" char dsa_driver_version[] = "0.1"; @@ -437,7 +439,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (of_phy_is_fixed_link(port_dn)) { phydev = of_phy_find_device(port_dn); if (phydev) { - int addr = phydev->addr; + int addr = phydev->mdio.addr; phy_device_free(phydev); of_node_put(port_dn); @@ -454,8 +456,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!ds->ports[port]) continue; - unregister_netdev(ds->ports[port]); - free_netdev(ds->ports[port]); + dsa_slave_destroy(ds->ports[port]); } mdiobus_unregister(ds->slave_mii_bus); @@ -506,33 +507,6 @@ static int dsa_switch_resume(struct dsa_switch *ds) } #endif - -/* link polling *************************************************************/ -static void dsa_link_poll_work(struct work_struct *ugly) -{ - struct dsa_switch_tree *dst; - int i; - - dst = container_of(ugly, struct dsa_switch_tree, link_poll_work); - - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; - - if (ds != NULL && ds->drv->poll_link != NULL) - ds->drv->poll_link(ds); - } - - mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ)); -} - -static void dsa_link_poll_timer(unsigned long _dst) -{ - struct dsa_switch_tree *dst = (void *)_dst; - - schedule_work(&dst->link_poll_work); -} - - /* platform driver init and cleanup *****************************************/ static int dev_is_class(struct device *dev, void *class) { @@ -688,6 +662,9 @@ static int dsa_of_probe(struct device *dev) const char *port_name; int chip_index, port_index; const unsigned int *sw_addr, *port_reg; + int gpio; + enum of_gpio_flags of_flags; + unsigned long flags; u32 eeprom_len; int ret; @@ -766,6 +743,19 @@ static int dsa_of_probe(struct device *dev) put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } + gpio = of_get_named_gpio_flags(child, "reset-gpios", 0, + &of_flags); + if (gpio_is_valid(gpio)) { + flags = (of_flags == OF_GPIO_ACTIVE_LOW ? + GPIOF_ACTIVE_LOW : 0); + ret = devm_gpio_request_one(dev, gpio, flags, + "switch_reset"); + if (ret) + goto out_free_chip; + + cd->reset = gpio_to_desc(gpio); + gpiod_direction_output(cd->reset, 0); + } for_each_available_child_of_node(child, port) { port_reg = of_get_property(port, "reg", NULL); @@ -859,8 +849,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, } dst->ds[i] = ds; - if (ds->drv->poll_link != NULL) - dst->link_poll_needed = 1; ++configured; } @@ -879,15 +867,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = (void *)dst; - if (dst->link_poll_needed) { - INIT_WORK(&dst->link_poll_work, dsa_link_poll_work); - init_timer(&dst->link_poll_timer); - dst->link_poll_timer.data = (unsigned long)dst; - dst->link_poll_timer.function = dsa_link_poll_timer; - dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); - add_timer(&dst->link_poll_timer); - } - return 0; } @@ -939,8 +918,10 @@ static int dsa_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dst); ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); - if (ret) + if (ret) { + dev_put(dev); goto out; + } return 0; @@ -954,17 +935,14 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - if (dst->link_poll_needed) - del_timer_sync(&dst->link_poll_timer); - - flush_work(&dst->link_poll_work); - for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; if (ds) dsa_switch_destroy(ds); } + + dev_put(dst->master_netdev); } static int dsa_remove(struct platform_device *pdev) @@ -1010,6 +988,14 @@ static int dsa_suspend(struct device *d) struct dsa_switch_tree *dst = platform_get_drvdata(pdev); int i, ret = 0; + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 311796c809af..1d1a54687e4a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -61,6 +61,7 @@ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name); +void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_netdevice_event(struct notifier_block *unused, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 7bc787b095c8..40b9ca72aae3 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -15,6 +15,7 @@ #include <linux/phy_fixed.h> #include <linux/of_net.h> #include <linux/of_mdio.h> +#include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/switchdev.h> #include <linux/if_bridge.h> @@ -997,7 +998,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, { struct dsa_switch *ds = p->parent; - p->phy = ds->slave_mii_bus->phy_map[addr]; + p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); if (!p->phy) { netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; @@ -1080,11 +1081,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret); return ret; } - } else { - netdev_info(slave_dev, "attached PHY at address %d [%s]\n", - p->phy->addr, p->phy->drv->name); } + phy_attached_info(p->phy); + return 0; } @@ -1189,13 +1189,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, p->old_link = -1; p->old_duplex = -1; - ret = dsa_slave_phy_setup(p, slave_dev); - if (ret) { - netdev_err(master, "error %d setting up slave phy\n", ret); - free_netdev(slave_dev); - return ret; - } - ds->ports[port] = slave_dev; ret = register_netdev(slave_dev); if (ret) { @@ -1209,9 +1202,27 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, netif_carrier_off(slave_dev); + ret = dsa_slave_phy_setup(p, slave_dev); + if (ret) { + netdev_err(master, "error %d setting up slave phy\n", ret); + free_netdev(slave_dev); + return ret; + } + return 0; } +void dsa_slave_destroy(struct net_device *slave_dev) +{ + struct dsa_slave_priv *p = netdev_priv(slave_dev); + + netif_carrier_off(slave_dev); + if (p->phy) + phy_disconnect(p->phy); + unregister_netdev(slave_dev); + free_netdev(slave_dev); +} + static bool dsa_slave_dev_check(struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9e63f252a89e..103871784e50 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -52,6 +52,8 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/if_ether.h> +#include <linux/of_net.h> +#include <linux/pci.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> @@ -485,3 +487,32 @@ static int __init eth_offload_init(void) } fs_initcall(eth_offload_init); + +unsigned char * __weak arch_get_platform_mac_address(void) +{ + return NULL; +} + +int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) +{ + const unsigned char *addr; + struct device_node *dp; + + if (dev_is_pci(dev)) + dp = pci_device_to_OF_node(to_pci_dev(dev)); + else + dp = dev->of_node; + + addr = NULL; + if (dp) + addr = of_get_mac_address(dp); + if (!addr) + addr = arch_get_platform_mac_address(); + + if (!addr) + return -ENODEV; + + ether_addr_copy(mac_addr, addr); + return 0; +} +EXPORT_SYMBOL(eth_platform_get_mac_address); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 35a9788bb3ae..c7d1adca30d8 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -312,7 +312,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) return; out: - WARN_ON_ONCE("HSR: Could not send supervision frame\n"); + WARN_ONCE(1, "HSR: Could not send supervision frame\n"); kfree_skb(skb); } diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 20c49c724ba0..737c87a2a41e 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -161,9 +161,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; - lowpan_netdev_setup(ldev, LOWPAN_LLTYPE_IEEE802154); - - ret = register_netdevice(ldev); + ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); return ret; @@ -180,7 +178,7 @@ static void lowpan_dellink(struct net_device *ldev, struct list_head *head) ASSERT_RTNL(); wdev->ieee802154_ptr->lowpan_dev = NULL; - unregister_netdevice(ldev); + lowpan_unregister_netdevice(ldev); dev_put(wdev); } diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6b437e8760d3..30d875dff6b5 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -624,7 +624,6 @@ int __init lowpan_net_frag_init(void) lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; - lowpan_frags.skb_free = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 416dfa004cfb..c22920525e5d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -436,6 +436,19 @@ config INET_UDP_DIAG Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. +config INET_DIAG_DESTROY + bool "INET: allow privileged process to administratively close sockets" + depends on INET_DIAG + default n + ---help--- + Provides a SOCK_DESTROY operation that allows privileged processes + (e.g., a connection manager or a network administration tool such as + ss) to close sockets opened by other processes. Closing a socket in + this way interrupts any blocking read/write/connect operations on + the socket and causes future socket calls to behave as if the socket + had been disconnected. + If unsure, say N. + menuconfig TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c29809f765dc..62c049b647e9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 11c4ca13ec3b..5c5db6636704 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -257,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cc8f3e506cde..473447593060 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1155,6 +1155,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info; struct in_device *in_dev; struct net *net = dev_net(dev); unsigned int flags; @@ -1193,6 +1194,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo case NETDEV_CHANGEMTU: rt_cache_flush(net); break; + case NETDEV_CHANGEUPPER: + info = ptr; + /* flush all routes if dev is linked to or unlinked from + * an L3 master device (e.g., VRF) + */ + if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + fib_disable_ip(dev, NETDEV_DOWN, true); + break; } return NOTIFY_DONE; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e5936c10d..7aea0ccb6be6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head) if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->tn_bits <= TNODE_KMALLOC_MAX) - kfree(n); else - vfree(n); + kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index e0fcbbbcfe54..976f0dcf6991 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -24,6 +24,7 @@ struct fou { u16 type; struct udp_offload udp_offloads; struct list_head list; + struct rcu_head rcu; }; #define FOU_F_REMCSUM_NOPARTIAL BIT(0) @@ -417,7 +418,7 @@ static void fou_release(struct fou *fou) list_del(&fou->list); udp_tunnel_sock_release(sock); - kfree(fou); + kfree_rcu(fou, rcu); } static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) @@ -497,7 +498,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; if (cfg->udp_config.family == AF_INET) { - err = udp_add_offload(&fou->udp_offloads); + err = udp_add_offload(net, &fou->udp_offloads); if (err) goto error; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6baf36e11808..05e4cba14162 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2126,7 +2126,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); - if (!in_dev) { + if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } @@ -2147,7 +2147,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; - ip_mc_dec_group(in_dev, group); + if (in_dev) + ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1feb15f23de8..46b9c887bede 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -563,7 +563,7 @@ static void reqsk_timer_handler(unsigned long data) int max_retries, thresh; u8 defer_accept; - if (sk_listener->sk_state != TCP_LISTEN) + if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; @@ -749,7 +749,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk->sk_state = TCP_LISTEN; + sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ab9f8a66615d..8bb8e7ad8548 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -350,17 +350,12 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, nlmsg_flags, unlh); } -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, - const struct nlmsghdr *nlh, - const struct inet_diag_req_v2 *req) +struct sock *inet_diag_find_one_icsk(struct net *net, + struct inet_hashinfo *hashinfo, + const struct inet_diag_req_v2 *req) { - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; struct sock *sk; - int err; - err = -EINVAL; if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], @@ -375,15 +370,33 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, req->id.idiag_if); #endif else - goto out_nosk; + return ERR_PTR(-EINVAL); - err = -ENOENT; if (!sk) - goto out_nosk; + return ERR_PTR(-ENOENT); - err = sock_diag_check_cookie(sk, req->id.idiag_cookie); - if (err) - goto out; + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_gen_put(sk); + return ERR_PTR(-ENOENT); + } + + return sk; +} +EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); + +int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, + struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + struct sock *sk; + int err; + + sk = inet_diag_find_one_icsk(net, hashinfo, req); + if (IS_ERR(sk)) + return PTR_ERR(sk); rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); if (!rep) { @@ -409,12 +422,11 @@ out: if (sk) sock_gen_put(sk); -out_nosk: return err; } EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); -static int inet_diag_get_exact(struct sk_buff *in_skb, +static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, const struct inet_diag_req_v2 *req) { @@ -424,8 +436,12 @@ static int inet_diag_get_exact(struct sk_buff *in_skb, handler = inet_diag_lock_handler(req->sdiag_protocol); if (IS_ERR(handler)) err = PTR_ERR(handler); - else + else if (cmd == SOCK_DIAG_BY_FAMILY) err = handler->dump_one(in_skb, nlh, req); + else if (cmd == SOCK_DESTROY && handler->destroy) + err = handler->destroy(in_skb, req); + else + err = -EOPNOTSUPP; inet_diag_unlock_handler(handler); return err; @@ -938,7 +954,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - return inet_diag_get_exact(in_skb, nlh, &req); + return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -972,7 +988,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) return inet_diag_get_exact_compat(skb, nlh); } -static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); struct net *net = sock_net(skb->sk); @@ -980,7 +996,8 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) if (nlmsg_len(h) < hdrlen) return -EINVAL; - if (h->nlmsg_flags & NLM_F_DUMP) { + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; @@ -999,7 +1016,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } } - return inet_diag_get_exact(skb, h, nlmsg_data(h)); + return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); } static @@ -1050,14 +1067,16 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) static const struct sock_diag_handler inet_diag_handler = { .family = AF_INET, - .dump = inet_diag_handler_dump, + .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, + .destroy = inet_diag_handler_cmd, }; static const struct sock_diag_handler inet6_diag_handler = { .family = AF_INET6, - .dump = inet_diag_handler_dump, + .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, + .destroy = inet_diag_handler_cmd, }; int inet_diag_register(const struct inet_diag_handler *h) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index fe144dae7372..3a88b0c73797 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -285,14 +285,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) } EXPORT_SYMBOL(inet_frag_kill); -static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, - struct sk_buff *skb) -{ - if (f->skb_free) - f->skb_free(skb); - kfree_skb(skb); -} - void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) { struct sk_buff *fp; @@ -309,7 +301,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) struct sk_buff *xp = fp->next; sum_truesize += fp->truesize; - frag_kfree_skb(nf, f, fp); + kfree_skb(fp); fp = xp; } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1fe55ae81781..3f00810b7288 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -891,7 +891,6 @@ void __init ipfrag_init(void) ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; - ip4_frags.skb_free = NULL; ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 614521437e30..7c51c4e1661f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -24,7 +24,6 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> -#include <linux/mroute.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/in6.h> @@ -562,10 +561,9 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) tunnel_id_to_key(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + + iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); return; err_free_rt: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4233cbe47052..64878efa045c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -76,7 +76,6 @@ #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> -#include <linux/mroute.h> #include <linux/netlink.h> #include <linux/tcp.h> @@ -240,6 +239,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * from host network stack. */ features = netif_skb_features(skb); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); @@ -912,7 +912,7 @@ static int __ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->dst.dev->features & NETIF_F_V4_CSUM && + rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && !(flags & MSG_MORE) && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -921,7 +921,7 @@ static int __ip_append_data(struct sock *sk, if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && - (sk->sk_type == SOCK_DGRAM)) { + (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cbb51f3fac06..c7bd72e9b544 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -30,7 +30,6 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> -#include <linux/mroute.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> @@ -657,7 +656,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, struct rtable *rt; /* Route to the other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst; - int err; bool connected; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); @@ -795,10 +793,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, return; } - err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, - tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); - + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6cb9009c3d96..859d415c0b2d 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -24,7 +24,6 @@ #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> -#include <linux/mroute.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> @@ -48,12 +47,13 @@ #include <net/rtnetlink.h> #include <net/dst_metadata.h> -int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df, bool xnet) +void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) { int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); + struct net_device *dev = skb->dev; struct iphdr *iph; int err; @@ -82,7 +82,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, err = ip_local_out(net, sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; - return pkt_len; + iptunnel_xmit_stats(dev, pkt_len); } EXPORT_SYMBOL_GPL(iptunnel_xmit); @@ -251,7 +251,7 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info = lwt_tun_info(new_state); if (tb[LWTUNNEL_IP_ID]) - tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); + tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); @@ -266,7 +266,7 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); + tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = 0; @@ -281,12 +281,12 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); - if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || + if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; return 0; @@ -346,7 +346,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info = lwt_tun_info(new_state); if (tb[LWTUNNEL_IP6_ID]) - tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]); + tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]); if (tb[LWTUNNEL_IP6_DST]) tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); @@ -361,7 +361,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); + tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = 0; @@ -376,12 +376,12 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); - if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; return 0; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 4d8f0b698777..5cf10b777b7e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -30,7 +30,6 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> -#include <linux/mroute.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> @@ -200,7 +199,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = skb->len; - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; tx_error_icmp: diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 0bc7412d9e14..67f7c9de0b16 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -65,15 +65,6 @@ #include <net/checksum.h> #include <asm/processor.h> -/* Define this to allow debugging output */ -#undef IPCONFIG_DEBUG - -#ifdef IPCONFIG_DEBUG -#define DBG(x) printk x -#else -#define DBG(x) do { } while(0) -#endif - #if defined(CONFIG_IP_PNP_DHCP) #define IPCONFIG_DHCP #endif @@ -227,7 +218,7 @@ static int __init ic_open_devs(void) if (dev->mtu >= 364) able |= IC_BOOTP; else - pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small", + pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small\n", dev->name, dev->mtu); if (!(dev->flags & IFF_NOARP)) able |= IC_RARP; @@ -254,8 +245,8 @@ static int __init ic_open_devs(void) else d->xid = 0; ic_proto_have_if |= able; - DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n", - dev->name, able, d->xid)); + pr_debug("IP-Config: %s UP (able=%d, xid=%08x)\n", + dev->name, able, d->xid); } } @@ -311,7 +302,7 @@ static void __init ic_close_devs(void) next = d->next; dev = d->dev; if (dev != ic_dev && !netdev_uses_dsa(dev)) { - DBG(("IP-Config: Downing %s\n", dev->name)); + pr_debug("IP-Config: Downing %s\n", dev->name); dev_change_flags(dev, d->flags); } kfree(d); @@ -464,7 +455,8 @@ static int __init ic_defaults(void) &ic_myaddr); return -1; } - printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); + pr_notice("IP-Config: Guessing netmask %pI4\n", + &ic_netmask); } return 0; @@ -675,9 +667,7 @@ ic_dhcp_init_options(u8 *options) u8 *e = options; int len; -#ifdef IPCONFIG_DEBUG - printk("DHCP: Sending message type %d\n", mt); -#endif + pr_debug("DHCP: Sending message type %d\n", mt); memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ e += 4; @@ -847,7 +837,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d else if (dev->type == ARPHRD_FDDI) b->htype = ARPHRD_ETHER; else { - printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); + pr_warn("Unknown ARP type 0x%04x for device %s\n", dev->type, + dev->name); b->htype = dev->type; /* can cause undefined behavior */ } @@ -904,14 +895,12 @@ static void __init ic_do_bootp_ext(u8 *ext) int i; __be16 mtu; -#ifdef IPCONFIG_DEBUG u8 *c; - printk("DHCP/BOOTP: Got extension %d:",*ext); + pr_debug("DHCP/BOOTP: Got extension %d:", *ext); for (c=ext+2; c<ext+2+ext[1]; c++) - printk(" %02x", *c); - printk("\n"); -#endif + pr_debug(" %02x", *c); + pr_debug("\n"); switch (*ext++) { case 1: /* Subnet mask */ @@ -1080,9 +1069,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str } } -#ifdef IPCONFIG_DEBUG - printk("DHCP: Got message type %d\n", mt); -#endif + pr_debug("DHCP: Got message type %d\n", mt); switch (mt) { case DHCPOFFER: @@ -1095,10 +1082,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str /* Let's accept that offer. */ ic_myaddr = b->your_ip; ic_servaddr = server_id; -#ifdef IPCONFIG_DEBUG - printk("DHCP: Offered address %pI4 by server %pI4\n", - &ic_myaddr, &b->iph.saddr); -#endif + pr_debug("DHCP: Offered address %pI4 by server %pI4\n", + &ic_myaddr, &b->iph.saddr); /* The DHCP indicated server address takes * precedence over the bootp header one if * they are different. @@ -1295,11 +1280,10 @@ static int __init ic_dynamic(void) return -1; } - printk("IP-Config: Got %s answer from %pI4, ", + pr_info("IP-Config: Got %s answer from %pI4, my address is %pI4\n", ((ic_got_reply & IC_RARP) ? "RARP" - : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), - &ic_addrservaddr); - pr_cont("my address is %pI4\n", &ic_myaddr); + : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), + &ic_addrservaddr, &ic_myaddr); return 0; } @@ -1426,7 +1410,7 @@ static int __init ip_auto_config(void) if (!ic_enable) return 0; - DBG(("IP-Config: Entered.\n")); + pr_debug("IP-Config: Entered.\n"); #ifdef IPCONFIG_DYNAMIC try_try_again: #endif @@ -1542,7 +1526,7 @@ static int __init ip_auto_config(void) pr_cont(", mtu=%d", ic_dev_mtu); for (i = 0; i < CONF_NAMESERVERS_MAX; i++) if (ic_nameservers[i] != NONE) { - pr_info(" nameserver%u=%pI4", + pr_cont(" nameserver%u=%pI4", i, &ic_nameservers[i]); break; } @@ -1585,7 +1569,7 @@ static int __init ic_proto_name(char *name) return 1; *v = 0; if (kstrtou8(client_id, 0, dhcp_client_identifier)) - DBG("DHCP: Invalid client identifier type\n"); + pr_debug("DHCP: Invalid client identifier type\n"); strncpy(dhcp_client_identifier + 1, v + 1, 251); *v = ','; } @@ -1644,7 +1628,7 @@ static int __init ip_auto_config_setup(char *addrs) if ((cp = strchr(ip, ':'))) *cp++ = '\0'; if (strlen(ip) > 0) { - DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); + pr_debug("IP-Config: Parameter #%d: `%s'\n", num, ip); switch (num) { case 0: if ((ic_myaddr = in_aton(ip)) == ANY) @@ -1716,7 +1700,7 @@ static int __init vendor_class_identifier_setup(char *addrs) if (strlcpy(vendor_class_identifier, addrs, sizeof(vendor_class_identifier)) >= sizeof(vendor_class_identifier)) - pr_warn("DHCP: vendorclass too long, truncated to \"%s\"", + pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n", vendor_class_identifier); return 1; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index f34c31defafe..4044da61e747 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -103,7 +103,6 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> -#include <linux/mroute.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> @@ -253,9 +252,6 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) p.i_key = p.o_key = 0; p.i_flags = p.o_flags = 0; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 92dd4b74d513..395e2814a46d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -66,28 +66,7 @@ #include <net/netlink.h> #include <net/fib_rules.h> #include <linux/netconf.h> - -#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) -#define CONFIG_IP_PIMSM 1 -#endif - -struct mr_table { - struct list_head list; - possible_net_t net; - u32 id; - struct sock __rcu *mroute_sk; - struct timer_list ipmr_expire_timer; - struct list_head mfc_unres_queue; - struct list_head mfc_cache_array[MFC_LINES]; - struct vif_device vif_table[MAXVIFS]; - int maxvif; - atomic_t cache_resolve_queue_len; - bool mroute_do_assert; - bool mroute_do_pim; -#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) - int mroute_reg_vif_num; -#endif -}; +#include <net/nexthop.h> struct ipmr_rule { struct fib_rule common; @@ -103,11 +82,7 @@ struct ipmr_result { static DEFINE_RWLOCK(mrt_lock); -/* - * Multicast router control variables - */ - -#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) +/* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); @@ -134,7 +109,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); -static void mroute_clean_tables(struct mr_table *mrt); +static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -252,8 +227,8 @@ static int __net_init ipmr_rules_init(struct net *net) INIT_LIST_HEAD(&net->ipv4.mr_tables); mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); - if (!mrt) { - err = -ENOMEM; + if (IS_ERR(mrt)) { + err = PTR_ERR(mrt); goto err1; } @@ -301,8 +276,13 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, static int __net_init ipmr_rules_init(struct net *net) { - net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); - return net->ipv4.mrt ? 0 : -ENOMEM; + struct mr_table *mrt; + + mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); + if (IS_ERR(mrt)) + return PTR_ERR(mrt); + net->ipv4.mrt = mrt; + return 0; } static void __net_exit ipmr_rules_exit(struct net *net) @@ -319,13 +299,17 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) struct mr_table *mrt; unsigned int i; + /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ + if (id != RT_TABLE_DEFAULT && id >= 1000000000) + return ERR_PTR(-EINVAL); + mrt = ipmr_get_table(net, id); if (mrt) return mrt; mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); if (!mrt) - return NULL; + return ERR_PTR(-ENOMEM); write_pnet(&mrt->net, net); mrt->id = id; @@ -338,9 +322,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, (unsigned long)mrt); -#ifdef CONFIG_IP_PIMSM mrt->mroute_reg_vif_num = -1; -#endif #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); #endif @@ -350,7 +332,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -387,8 +369,24 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) } } -static -struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) +/* Initialize ipmr pimreg/tunnel in_device */ +static bool ipmr_init_vif_indev(const struct net_device *dev) +{ + struct in_device *in_dev; + + ASSERT_RTNL(); + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + return false; + ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); + IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; + + return true; +} + +static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *dev; @@ -399,7 +397,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) int err; struct ifreq ifr; struct ip_tunnel_parm p; - struct in_device *in_dev; memset(&p, 0, sizeof(p)); p.iph.daddr = v->vifc_rmt_addr.s_addr; @@ -424,15 +421,8 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) if (err == 0 && (dev = __dev_get_by_name(net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; - - in_dev = __in_dev_get_rtnl(dev); - if (!in_dev) + if (!ipmr_init_vif_indev(dev)) goto failure; - - ipv4_devconf_setall(in_dev); - neigh_parms_data_state_setall(in_dev->arp_parms); - IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; - if (dev_open(dev)) goto failure; dev_hold(dev); @@ -441,16 +431,11 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } -#ifdef CONFIG_IP_PIMSM - +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); @@ -500,7 +485,6 @@ static void reg_vif_setup(struct net_device *dev) static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; - struct in_device *in_dev; char name[IFNAMSIZ]; if (mrt->id == RT_TABLE_DEFAULT) @@ -520,18 +504,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) return NULL; } - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (!in_dev) { - rcu_read_unlock(); + if (!ipmr_init_vif_indev(dev)) goto failure; - } - - ipv4_devconf_setall(in_dev); - neigh_parms_data_state_setall(in_dev->arp_parms); - IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; - rcu_read_unlock(); - if (dev_open(dev)) goto failure; @@ -540,20 +514,59 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } + +/* called with rcu_read_lock() */ +static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, + unsigned int pimlen) +{ + struct net_device *reg_dev = NULL; + struct iphdr *encap; + + encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); + /* Check that: + * a. packet is really sent to a multicast group + * b. packet is not a NULL-REGISTER + * c. packet is not truncated + */ + if (!ipv4_is_multicast(encap->daddr) || + encap->tot_len == 0 || + ntohs(encap->tot_len) + pimlen > skb->len) + return 1; + + read_lock(&mrt_lock); + if (mrt->mroute_reg_vif_num >= 0) + reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; + read_unlock(&mrt_lock); + + if (!reg_dev) + return 1; + + skb->mac_header = skb->network_header; + skb_pull(skb, (u8 *)encap - skb->data); + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = CHECKSUM_NONE; + + skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); + + netif_rx(skb); + + return NET_RX_SUCCESS; +} +#else +static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) +{ + return NULL; +} #endif /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call */ - static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { @@ -575,10 +588,8 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, return -EADDRNOTAVAIL; } -#ifdef CONFIG_IP_PIMSM if (vifi == mrt->mroute_reg_vif_num) mrt->mroute_reg_vif_num = -1; -#endif if (vifi + 1 == mrt->maxvif) { int tmp; @@ -625,7 +636,6 @@ static inline void ipmr_cache_free(struct mfc_cache *c) /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. */ - static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) { struct net *net = read_pnet(&mrt->net); @@ -653,9 +663,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) ipmr_cache_free(c); } - /* Timer process for the unresolved queue. */ - static void ipmr_expire_process(unsigned long arg) { struct mr_table *mrt = (struct mr_table *)arg; @@ -695,7 +703,6 @@ out: } /* Fill oifs list. It is called under write locked mrt_lock. */ - static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, unsigned char *ttls) { @@ -731,10 +738,10 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EADDRINUSE; switch (vifc->vifc_flags) { -#ifdef CONFIG_IP_PIMSM case VIFF_REGISTER: - /* - * Special Purpose VIF in PIM + if (!ipmr_pimsm_enabled()) + return -EINVAL; + /* Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) @@ -749,7 +756,6 @@ static int vif_add(struct net *net, struct mr_table *mrt, return err; } break; -#endif case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); if (!dev) @@ -761,7 +767,6 @@ static int vif_add(struct net *net, struct mr_table *mrt, return err; } break; - case VIFF_USE_IFINDEX: case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { @@ -815,10 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* And finish update writing critical data */ write_lock_bh(&mrt_lock); v->dev = dev; -#ifdef CONFIG_IP_PIMSM if (v->flags & VIFF_REGISTER) mrt->mroute_reg_vif_num = vifi; -#endif if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); @@ -883,9 +886,7 @@ skip: return ipmr_cache_find_any_parent(mrt, vifi); } -/* - * Allocate a multicast cache entry - */ +/* Allocate a multicast cache entry */ static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); @@ -906,10 +907,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) return c; } -/* - * A cache entry has gone into a resolved state from queued - */ - +/* A cache entry has gone into a resolved state from queued */ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc_cache *uc, struct mfc_cache *c) { @@ -917,7 +915,6 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsgerr *e; /* Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); @@ -941,34 +938,29 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, } } -/* - * Bounce a cache query up to mrouted. We could use netlink for this but mrouted - * expects the following bizarre scheme. +/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted + * expects the following bizarre scheme. * - * Called under mrt_lock. + * Called under mrt_lock. */ - static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert) { - struct sk_buff *skb; const int ihl = ip_hdrlen(pkt); + struct sock *mroute_sk; struct igmphdr *igmp; struct igmpmsg *msg; - struct sock *mroute_sk; + struct sk_buff *skb; int ret; -#ifdef CONFIG_IP_PIMSM if (assert == IGMPMSG_WHOLEPKT) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else -#endif skb = alloc_skb(128, GFP_ATOMIC); if (!skb) return -ENOBUFS; -#ifdef CONFIG_IP_PIMSM if (assert == IGMPMSG_WHOLEPKT) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. @@ -986,28 +978,23 @@ static int ipmr_cache_report(struct mr_table *mrt, ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); - } else -#endif - { - - /* Copy the IP header */ - - skb_set_network_header(skb, skb->len); - skb_put(skb, ihl); - skb_copy_to_linear_data(skb, pkt->data, ihl); - ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ - msg = (struct igmpmsg *)skb_network_header(skb); - msg->im_vif = vifi; - skb_dst_set(skb, dst_clone(skb_dst(pkt))); - - /* Add our header */ - - igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); - igmp->type = - msg->im_msgtype = assert; - igmp->code = 0; - ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ - skb->transport_header = skb->network_header; + } else { + /* Copy the IP header */ + skb_set_network_header(skb, skb->len); + skb_put(skb, ihl); + skb_copy_to_linear_data(skb, pkt->data, ihl); + /* Flag to the kernel this is a route add */ + ip_hdr(skb)->protocol = 0; + msg = (struct igmpmsg *)skb_network_header(skb); + msg->im_vif = vifi; + skb_dst_set(skb, dst_clone(skb_dst(pkt))); + /* Add our header */ + igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + igmp->type = assert; + msg->im_msgtype = assert; + igmp->code = 0; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + skb->transport_header = skb->network_header; } rcu_read_lock(); @@ -1019,7 +1006,6 @@ static int ipmr_cache_report(struct mr_table *mrt, } /* Deliver to mrouted */ - ret = sock_queue_rcv_skb(mroute_sk, skb); rcu_read_unlock(); if (ret < 0) { @@ -1030,12 +1016,9 @@ static int ipmr_cache_report(struct mr_table *mrt, return ret; } -/* - * Queue a packet for resolution. It gets locked cache entry! - */ - -static int -ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) +/* Queue a packet for resolution. It gets locked cache entry! */ +static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, + struct sk_buff *skb) { bool found = false; int err; @@ -1053,7 +1036,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) if (!found) { /* Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || (c = ipmr_cache_alloc_unres()) == NULL) { spin_unlock_bh(&mfc_unres_lock); @@ -1063,13 +1045,11 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } /* Fill in the new cache entry */ - c->mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ - err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry @@ -1091,7 +1071,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } /* See if we can append the packet */ - if (c->mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; @@ -1104,9 +1083,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) return err; } -/* - * MFC cache manipulation by user space mroute daemon - */ +/* MFC cache manipulation by user space mroute daemon */ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { @@ -1177,9 +1154,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); - /* - * Check to see if we resolved a queued list. If so we - * need to send on the frames and tidy up. + /* Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); @@ -1204,29 +1180,25 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, return 0; } -/* - * Close the multicast socket, and clear the vif tables etc - */ - -static void mroute_clean_tables(struct mr_table *mrt) +/* Close the multicast socket, and clear the vif tables etc */ +static void mroute_clean_tables(struct mr_table *mrt, bool all) { int i; LIST_HEAD(list); struct mfc_cache *c, *next; /* Shut down all active vif entries */ - for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags & VIFF_STATIC)) - vif_delete(mrt, i, 0, &list); + if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) + continue; + vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); /* Wipe the cache */ - for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); @@ -1261,50 +1233,58 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); } } rtnl_unlock(); } -/* - * Socket options and virtual interface manipulation. The whole - * virtual interface system is a complete heap, but unfortunately - * that's how BSD mrouted happens to think. Maybe one day with a proper - * MOSPF/PIM router set up we can clean this up. +/* Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. */ -int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, + unsigned int optlen) { - int ret, parent = 0; - struct vifctl vif; - struct mfcctl mfc; struct net *net = sock_net(sk); + int val, ret = 0, parent = 0; struct mr_table *mrt; + struct vifctl vif; + struct mfcctl mfc; + u32 uval; + /* There's one exception to the lock - MRT_DONE which needs to unlock */ + rtnl_lock(); if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->inet_num != IPPROTO_IGMP) - return -EOPNOTSUPP; + inet_sk(sk)->inet_num != IPPROTO_IGMP) { + ret = -EOPNOTSUPP; + goto out_unlock; + } mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); - if (!mrt) - return -ENOENT; - + if (!mrt) { + ret = -ENOENT; + goto out_unlock; + } if (optname != MRT_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && - !ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EACCES; + !ns_capable(net->user_ns, CAP_NET_ADMIN)) { + ret = -EACCES; + goto out_unlock; + } } switch (optname) { case MRT_INIT: - if (optlen != sizeof(int)) - return -EINVAL; - - rtnl_lock(); + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } if (rtnl_dereference(mrt->mroute_sk)) { - rtnl_unlock(); - return -EADDRINUSE; + ret = -EADDRINUSE; + break; } ret = ip_ra_control(sk, 1, mrtsock_destruct); @@ -1315,129 +1295,133 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } - rtnl_unlock(); - return ret; + break; case MRT_DONE: - if (sk != rcu_access_pointer(mrt->mroute_sk)) - return -EACCES; - return ip_ra_control(sk, 0, NULL); + if (sk != rcu_access_pointer(mrt->mroute_sk)) { + ret = -EACCES; + } else { + /* We need to unlock here because mrtsock_destruct takes + * care of rtnl itself and we can't change that due to + * the IP_ROUTER_ALERT setsockopt which runs without it. + */ + rtnl_unlock(); + ret = ip_ra_control(sk, 0, NULL); + goto out; + } + break; case MRT_ADD_VIF: case MRT_DEL_VIF: - if (optlen != sizeof(vif)) - return -EINVAL; - if (copy_from_user(&vif, optval, sizeof(vif))) - return -EFAULT; - if (vif.vifc_vifi >= MAXVIFS) - return -ENFILE; - rtnl_lock(); + if (optlen != sizeof(vif)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&vif, optval, sizeof(vif))) { + ret = -EFAULT; + break; + } + if (vif.vifc_vifi >= MAXVIFS) { + ret = -ENFILE; + break; + } if (optname == MRT_ADD_VIF) { ret = vif_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } - rtnl_unlock(); - return ret; - - /* - * Manipulate the forwarding caches. These live - * in a sort of kernel/user symbiosis. - */ + break; + /* Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: - if (optlen != sizeof(mfc)) - return -EINVAL; - if (copy_from_user(&mfc, optval, sizeof(mfc))) - return -EFAULT; + if (optlen != sizeof(mfc)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&mfc, optval, sizeof(mfc))) { + ret = -EFAULT; + break; + } if (parent == 0) parent = mfc.mfcc_parent; - rtnl_lock(); if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); - rtnl_unlock(); - return ret; - /* - * Control PIM assert. - */ + break; + /* Control PIM assert. */ case MRT_ASSERT: - { - int v; - if (optlen != sizeof(v)) - return -EINVAL; - if (get_user(v, (int __user *)optval)) - return -EFAULT; - mrt->mroute_do_assert = v; - return 0; - } -#ifdef CONFIG_IP_PIMSM + if (optlen != sizeof(val)) { + ret = -EINVAL; + break; + } + if (get_user(val, (int __user *)optval)) { + ret = -EFAULT; + break; + } + mrt->mroute_do_assert = val; + break; case MRT_PIM: - { - int v; - - if (optlen != sizeof(v)) - return -EINVAL; - if (get_user(v, (int __user *)optval)) - return -EFAULT; - v = !!v; + if (!ipmr_pimsm_enabled()) { + ret = -ENOPROTOOPT; + break; + } + if (optlen != sizeof(val)) { + ret = -EINVAL; + break; + } + if (get_user(val, (int __user *)optval)) { + ret = -EFAULT; + break; + } - rtnl_lock(); - ret = 0; - if (v != mrt->mroute_do_pim) { - mrt->mroute_do_pim = v; - mrt->mroute_do_assert = v; + val = !!val; + if (val != mrt->mroute_do_pim) { + mrt->mroute_do_pim = val; + mrt->mroute_do_assert = val; } - rtnl_unlock(); - return ret; - } -#endif -#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + break; case MRT_TABLE: - { - u32 v; - - if (optlen != sizeof(u32)) - return -EINVAL; - if (get_user(v, (u32 __user *)optval)) - return -EFAULT; - - /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ - if (v != RT_TABLE_DEFAULT && v >= 1000000000) - return -EINVAL; + if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) { + ret = -ENOPROTOOPT; + break; + } + if (optlen != sizeof(uval)) { + ret = -EINVAL; + break; + } + if (get_user(uval, (u32 __user *)optval)) { + ret = -EFAULT; + break; + } - rtnl_lock(); - ret = 0; if (sk == rtnl_dereference(mrt->mroute_sk)) { ret = -EBUSY; } else { - if (!ipmr_new_table(net, v)) - ret = -ENOMEM; + mrt = ipmr_new_table(net, uval); + if (IS_ERR(mrt)) + ret = PTR_ERR(mrt); else - raw_sk(sk)->ipmr_table = v; + raw_sk(sk)->ipmr_table = uval; } - rtnl_unlock(); - return ret; - } -#endif - /* - * Spurious command, or MRT_VERSION which you cannot - * set. - */ + break; + /* Spurious command, or MRT_VERSION which you cannot set. */ default: - return -ENOPROTOOPT; + ret = -ENOPROTOOPT; } +out_unlock: + rtnl_unlock(); +out: + return ret; } -/* - * Getsock opt support for the multicast routing system. - */ - +/* Getsock opt support for the multicast routing system. */ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int olr; @@ -1453,39 +1437,35 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int if (!mrt) return -ENOENT; - if (optname != MRT_VERSION && -#ifdef CONFIG_IP_PIMSM - optname != MRT_PIM && -#endif - optname != MRT_ASSERT) + switch (optname) { + case MRT_VERSION: + val = 0x0305; + break; + case MRT_PIM: + if (!ipmr_pimsm_enabled()) + return -ENOPROTOOPT; + val = mrt->mroute_do_pim; + break; + case MRT_ASSERT: + val = mrt->mroute_do_assert; + break; + default: return -ENOPROTOOPT; + } if (get_user(olr, optlen)) return -EFAULT; - olr = min_t(unsigned int, olr, sizeof(int)); if (olr < 0) return -EINVAL; - if (put_user(olr, optlen)) return -EFAULT; - if (optname == MRT_VERSION) - val = 0x0305; -#ifdef CONFIG_IP_PIMSM - else if (optname == MRT_PIM) - val = mrt->mroute_do_pim; -#endif - else - val = mrt->mroute_do_assert; if (copy_to_user(optval, &val, olr)) return -EFAULT; return 0; } -/* - * The IP multicast ioctl support routines. - */ - +/* The IP multicast ioctl support routines. */ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { struct sioc_sg_req sr; @@ -1618,7 +1598,6 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) } #endif - static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -1640,17 +1619,14 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v return NOTIFY_DONE; } - static struct notifier_block ip_mr_notifier = { .notifier_call = ipmr_device_event, }; -/* - * Encapsulate a packet by attaching a valid IPIP header to it. - * This avoids tunnel drivers and other mess and gives us the speed so - * important for multicast video. +/* Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. */ - static void ip_encap(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { @@ -1692,9 +1668,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -/* - * Processing handlers for ipmr_forward - */ +/* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, int vifi) @@ -1709,7 +1683,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (!vif->dev) goto out_free; -#ifdef CONFIG_IP_PIMSM if (vif->flags & VIFF_REGISTER) { vif->pkt_out++; vif->bytes_out += skb->len; @@ -1718,7 +1691,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } -#endif if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, @@ -1745,7 +1717,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * allow to send ICMP, so that packets will disappear * to blackhole. */ - IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; @@ -1777,8 +1748,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, IPCB(skb)->flags |= IPSKB_FORWARDED; - /* - * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface @@ -1809,7 +1779,6 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) } /* "local" means that we should preserve one skb (for local delivery) */ - static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local) @@ -1834,9 +1803,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto forward; } - /* - * Wrong interface: drop packet and (maybe) send PIM assert. - */ + /* Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != skb->dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. @@ -1875,9 +1842,7 @@ forward: mrt->vif_table[vif].pkt_in++; mrt->vif_table[vif].bytes_in += skb->len; - /* - * Forward the frame - */ + /* Forward the frame */ if (cache->mfc_origin == htonl(INADDR_ANY) && cache->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && @@ -1951,11 +1916,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) return mrt; } -/* - * Multicast packets for forwarding arrive here - * Called with rcu_read_lock(); +/* Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); */ - int ip_mr_input(struct sk_buff *skb) { struct mfc_cache *cache; @@ -2006,9 +1969,7 @@ int ip_mr_input(struct sk_buff *skb) vif); } - /* - * No usable cache entry - */ + /* No usable cache entry */ if (!cache) { int vif; @@ -2049,53 +2010,8 @@ dont_forward: return 0; } -#ifdef CONFIG_IP_PIMSM -/* called with rcu_read_lock() */ -static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, - unsigned int pimlen) -{ - struct net_device *reg_dev = NULL; - struct iphdr *encap; - - encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); - /* - * Check that: - * a. packet is really sent to a multicast group - * b. packet is not a NULL-REGISTER - * c. packet is not truncated - */ - if (!ipv4_is_multicast(encap->daddr) || - encap->tot_len == 0 || - ntohs(encap->tot_len) + pimlen > skb->len) - return 1; - - read_lock(&mrt_lock); - if (mrt->mroute_reg_vif_num >= 0) - reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; - read_unlock(&mrt_lock); - - if (!reg_dev) - return 1; - - skb->mac_header = skb->network_header; - skb_pull(skb, (u8 *)encap - skb->data); - skb_reset_network_header(skb); - skb->protocol = htons(ETH_P_IP); - skb->ip_summed = CHECKSUM_NONE; - - skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); - - netif_rx(skb); - - return NET_RX_SUCCESS; -} -#endif - #ifdef CONFIG_IP_PIMSM_V1 -/* - * Handle IGMP messages of PIMv1 - */ - +/* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; @@ -2256,8 +2172,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } read_lock(&mrt_lock); - if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) - cache->mfc_flags |= MFC_NOTIFY; err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); rcu_read_unlock(); @@ -2419,10 +2333,133 @@ done: return skb->len; } +static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { + [RTA_SRC] = { .type = NLA_U32 }, + [RTA_DST] = { .type = NLA_U32 }, + [RTA_IIF] = { .type = NLA_U32 }, + [RTA_TABLE] = { .type = NLA_U32 }, + [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, +}; + +static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol) +{ + switch (rtm_protocol) { + case RTPROT_STATIC: + case RTPROT_MROUTED: + return true; + } + return false; +} + +static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc) +{ + struct rtnexthop *rtnh = nla_data(nla); + int remaining = nla_len(nla), vifi = 0; + + while (rtnh_ok(rtnh, remaining)) { + mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops; + if (++vifi == MAXVIFS) + break; + rtnh = rtnh_next(rtnh, &remaining); + } + + return remaining > 0 ? -EINVAL : vifi; +} + +/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */ +static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, + struct mfcctl *mfcc, int *mrtsock, + struct mr_table **mrtret) +{ + struct net_device *dev = NULL; + u32 tblid = RT_TABLE_DEFAULT; + struct mr_table *mrt; + struct nlattr *attr; + struct rtmsg *rtm; + int ret, rem; + + ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy); + if (ret < 0) + goto out; + rtm = nlmsg_data(nlh); + + ret = -EINVAL; + if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 || + rtm->rtm_type != RTN_MULTICAST || + rtm->rtm_scope != RT_SCOPE_UNIVERSE || + !ipmr_rtm_validate_proto(rtm->rtm_protocol)) + goto out; + + memset(mfcc, 0, sizeof(*mfcc)); + mfcc->mfcc_parent = -1; + ret = 0; + nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) { + switch (nla_type(attr)) { + case RTA_SRC: + mfcc->mfcc_origin.s_addr = nla_get_be32(attr); + break; + case RTA_DST: + mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); + break; + case RTA_IIF: + dev = __dev_get_by_index(net, nla_get_u32(attr)); + if (!dev) { + ret = -ENODEV; + goto out; + } + break; + case RTA_MULTIPATH: + if (ipmr_nla_get_ttls(attr, mfcc) < 0) { + ret = -EINVAL; + goto out; + } + break; + case RTA_PREFSRC: + ret = 1; + break; + case RTA_TABLE: + tblid = nla_get_u32(attr); + break; + } + } + mrt = ipmr_get_table(net, tblid); + if (!mrt) { + ret = -ENOENT; + goto out; + } + *mrtret = mrt; + *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0; + if (dev) + mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); + +out: + return ret; +} + +/* takes care of both newroute and delroute */ +static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + int ret, mrtsock, parent; + struct mr_table *tbl; + struct mfcctl mfcc; + + mrtsock = 0; + tbl = NULL; + ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl); + if (ret < 0) + return ret; + + parent = ret ? mfcc.mfcc_parent : -1; + if (nlh->nlmsg_type == RTM_NEWROUTE) + return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); + else + return ipmr_mfc_delete(tbl, &mfcc, parent); +} + #ifdef CONFIG_PROC_FS -/* - * The /proc interfaces to multicast routing : - * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif +/* The /proc interfaces to multicast routing : + * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; @@ -2706,10 +2743,7 @@ static const struct net_protocol pim_protocol = { }; #endif - -/* - * Setup for IP multicast routing - */ +/* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; @@ -2759,8 +2793,6 @@ int __init ip_mr_init(void) sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - if (!mrt_cachep) - return -ENOMEM; err = register_pernet_subsys(&ipmr_net_ops); if (err) @@ -2778,6 +2810,10 @@ int __init ip_mr_init(void) #endif rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute, NULL); + rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, + ipmr_rtm_route, NULL, NULL); + rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, + ipmr_rtm_route, NULL, NULL); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a35584176535..c187c60e3e0c 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -60,6 +60,7 @@ config NFT_REJECT_IPV4 config NFT_DUP_IPV4 tristate "IPv4 nf_tables packet duplication support" + depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 help This module enables IPv4 packet duplication support for nf_tables. diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 11dccba474b7..b488cac9c5ca 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -38,13 +38,13 @@ MODULE_DESCRIPTION("arptables core"); /*#define DEBUG_ARP_TABLES_USER*/ #ifdef DEBUG_ARP_TABLES -#define dprintf(format, args...) printk(format , ## args) +#define dprintf(format, args...) pr_debug(format, ## args) #else #define dprintf(format, args...) #endif #ifdef DEBUG_ARP_TABLES_USER -#define duprintf(format, args...) printk(format , ## args) +#define duprintf(format, args...) pr_debug(format, ## args) #else #define duprintf(format, args...) #endif @@ -1905,7 +1905,7 @@ static int __init arp_tables_init(void) if (ret < 0) goto err4; - printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n"); + pr_info("arp_tables: (C) 2002 David S. Miller\n"); return 0; err4: diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 461ca926fd39..e3c46e8e2762 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -451,7 +451,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { - printk(KERN_ERR "Unable to register netfilter socket option\n"); + pr_err("Unable to register netfilter socket option\n"); return ret; } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 5075b7ecd26d..61c7cc22ea68 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -132,7 +132,8 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { if (!(rt->rt_flags & RTCF_LOCAL) && - (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { + (!skb->dev || skb->dev->features & + (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 657d2307f031..b3ca21b2ba9b 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct net *net = nf_ct_net(ct); const struct nf_conn *master = ct->master; struct nf_conntrack_expect *other_exp; - struct nf_conntrack_tuple t; + struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; struct nf_nat_range range; diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ddb894ac1458..c9b52c361da2 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1048,7 +1048,7 @@ static int snmp_parse_mangle(unsigned char *msg, if (!asn1_uint_decode (&ctx, end, &vers)) return 0; if (debug > 1) - printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); + pr_debug("bsalg: snmp version: %u\n", vers + 1); if (vers > 1) return 1; @@ -1064,10 +1064,10 @@ static int snmp_parse_mangle(unsigned char *msg, if (debug > 1) { unsigned int i; - printk(KERN_DEBUG "bsalg: community: "); + pr_debug("bsalg: community: "); for (i = 0; i < comm.len; i++) - printk("%c", comm.data[i]); - printk("\n"); + pr_cont("%c", comm.data[i]); + pr_cont("\n"); } kfree(comm.data); @@ -1091,9 +1091,9 @@ static int snmp_parse_mangle(unsigned char *msg, }; if (pdutype > SNMP_PDU_TRAP2) - printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); + pr_debug("bsalg: bad pdu type %u\n", pdutype); else - printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); + pr_debug("bsalg: pdu: %s\n", pdus[pdutype]); } if (pdutype != SNMP_PDU_RESPONSE && pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) @@ -1119,7 +1119,7 @@ static int snmp_parse_mangle(unsigned char *msg, return 0; if (debug > 1) - printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " + pr_debug("bsalg: request: id=0x%lx error_status=%u " "error_index=%u\n", req.id, req.error_status, req.error_index); } @@ -1145,13 +1145,13 @@ static int snmp_parse_mangle(unsigned char *msg, } if (debug > 1) { - printk(KERN_DEBUG "bsalg: object: "); + pr_debug("bsalg: object: "); for (i = 0; i < obj->id_len; i++) { if (i > 0) - printk("."); - printk("%lu", obj->id[i]); + pr_cont("."); + pr_cont("%lu", obj->id[i]); } - printk(": type=%u\n", obj->type); + pr_cont(": type=%u\n", obj->type); } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index c747b2d9eb77..b6ea57ec5e14 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -14,7 +14,6 @@ #include <net/netfilter/ipv4/nf_reject.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> -#include <net/netfilter/ipv4/nf_reject.h> const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *_oth, int hook) diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 9d09d4f59545..cd84d4295a20 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -57,7 +57,7 @@ err: static void nf_tables_arp_exit_net(struct net *net) { - nft_unregister_afinfo(net->nft.arp); + nft_unregister_afinfo(net, net->nft.arp); kfree(net->nft.arp); } diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index ca9dc3c46c4f..e44ba3b12fbb 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -78,7 +78,7 @@ err: static void nf_tables_ipv4_exit_net(struct net *net) { - nft_unregister_afinfo(net->nft.ipv4); + nft_unregister_afinfo(net, net->nft.ipv4); kfree(net->nft.ipv4); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e89094ab5ddb..c117b21b937d 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1063,6 +1063,7 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) + __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; @@ -1094,6 +1095,7 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) + __releases(ping_table.lock) { read_unlock_bh(&ping_table.lock); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8c0d0bdc2a7c..bc35f1842512 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -406,10 +406,12 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->transport_header += iphlen; + if (iph->protocol == IPPROTO_ICMP && + length >= iphlen + sizeof(struct icmphdr)) + icmp_out_count(net, ((struct icmphdr *) + skb_transport_header(skb))->type); } - if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(net, ((struct icmphdr *) - skb_transport_header(skb))->type); err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, @@ -599,8 +601,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); - if (!saddr && ipc.oif) - l3mdev_get_saddr(net, ipc.oif, &fl4); + if (!saddr && ipc.oif) { + err = l3mdev_get_saddr(net, ipc.oif, &fl4); + if (err < 0) + goto done; + } if (!inet->hdrincl) { rfv.msg = msg; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4cbe9f0a4281..643a86c49020 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack.v64 = 0; treq->tfo_listener = false; - ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_iif = inet_request_bound_dev_if(sk, skb); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) @@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark, + flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a0bd7a55193e..4d367b4139a3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,7 +24,6 @@ #include <net/cipso_ipv4.h> #include <net/inet_frag.h> #include <net/ping.h> -#include <net/tcp_memcontrol.h> static int zero; static int one = 1; @@ -337,27 +336,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_keepalive_time", - .data = &sysctl_tcp_keepalive_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "tcp_keepalive_probes", - .data = &sysctl_tcp_keepalive_probes, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_keepalive_intvl", - .data = &sysctl_tcp_keepalive_intvl, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { .procname = "tcp_retries1", .data = &sysctl_tcp_retries1, .maxlen = sizeof(int), @@ -915,6 +893,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "tcp_l3mdev_accept", + .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, @@ -950,6 +939,27 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_keepalive_time", + .data = &init_net.ipv4.sysctl_tcp_keepalive_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_keepalive_probes", + .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_keepalive_intvl", + .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cfa7c0c1e80..fd17eec93525 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -422,7 +422,8 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - sock_update_memcg(sk); + if (mem_cgroup_sockets_enabled) + sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } @@ -451,11 +452,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + int state; sock_rps_record_flow(sk); sock_poll_wait(file, sk_sleep(sk), wait); - if (sk->sk_state == TCP_LISTEN) + + state = sk_state_load(sk); + if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events @@ -492,14 +496,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= POLLIN | POLLRDNORM | POLLRDHUP; /* Connected or passive Fast Open socket? */ - if (sk->sk_state != TCP_SYN_SENT && - (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) { + if (state != TCP_SYN_SENT && + (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); if (tp->urg_seq == tp->copied_seq && @@ -507,9 +511,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - /* Potential race condition. If read of tp below will - * escape above sk->sk_state, we can be illegally awaken - * in SYN_* states. */ if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; @@ -517,8 +518,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_is_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - set_bit(SOCK_ASYNC_NOSPACE, - &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after @@ -906,7 +906,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, goto out_err; } - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; @@ -1019,7 +1019,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, ssize_t res; if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) + !sk_check_csum_caps(sk)) return sock_no_sendpage(sk->sk_socket, page, offset, size, flags); @@ -1134,7 +1134,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } /* This should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); mss_now = tcp_send_mss(sk, &size_goal, flags); @@ -1176,7 +1176,7 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & NETIF_F_ALL_CSUM) + if (sk_check_csum_caps(sk)) skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); @@ -1934,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk->sk_state = state; + sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2644,7 +2644,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk->sk_state; + info->tcpi_state = sk_state_load(sk); + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2672,7 +2673,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - if (sk->sk_state == TCP_LISTEN) { + if (info->tcpi_state == TCP_LISTEN) { info->tcpi_unacked = sk->sk_ack_backlog; info->tcpi_sacked = sk->sk_max_ack_backlog; } else { @@ -3080,6 +3081,52 @@ void tcp_done(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_done); +int tcp_abort(struct sock *sk, int err) +{ + if (!sk_fullsock(sk)) { + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + + local_bh_disable(); + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, + req); + local_bh_enable(); + return 0; + } + sock_gen_put(sk); + return -EOPNOTSUPP; + } + + /* Don't race with userspace socket closes such as tcp_close. */ + lock_sock(sk); + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + inet_csk_listen_stop(sk); + } + + /* Don't race with BH socket closes such as inet_csk_listen_stop. */ + local_bh_disable(); + bh_lock_sock(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_err = err; + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + sk->sk_error_report(sk); + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + } + + bh_unlock_sock(sk); + local_bh_enable(); + release_sock(sk); + sock_put(sk); + return 0; +} +EXPORT_SYMBOL_GPL(tcp_abort); + extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 479f34946177..4d610934fb39 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -10,6 +10,8 @@ */ #include <linux/module.h> +#include <linux/net.h> +#include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/tcp.h> @@ -21,7 +23,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) { + if (sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { @@ -46,12 +48,29 @@ static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int tcp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req); + + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return sock_diag_destroy(sk, ECONNABORTED); +} +#endif + static const struct inet_diag_handler tcp_diag_handler = { .dump = tcp_diag_dump, .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = tcp_diag_destroy, +#endif }; static int __init tcp_diag_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd88c3803a6..0003d409fec5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2478,6 +2478,9 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, int newly_acked_sacked = prior_unsacked - (tp->packets_out - tp->sacked_out); + if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) + return; + tp->prr_delivered += newly_acked_sacked; if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + @@ -4481,19 +4484,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_buff *skb; + int err = -ENOMEM; + int data_len = 0; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size, sk->sk_allocation); + if (size > PAGE_SIZE) { + int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); + + data_len = npages << PAGE_SHIFT; + size = data_len + (size & ~PAGE_MASK); + } + skb = alloc_skb_with_frags(size - data_len, data_len, + PAGE_ALLOC_COSTLY_ORDER, + &err, sk->sk_allocation); if (!skb) goto err; + skb_put(skb, size - data_len); + skb->data_len = data_len; + skb->len = size; + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_from_msg(skb_put(skb, size), msg, size)) + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (err) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; @@ -4509,7 +4527,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) err_free: kfree_skb(skb); err: - return -ENOMEM; + return err; + } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) @@ -5667,6 +5686,7 @@ discard: } tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->copied_seq = tp->rcv_nxt; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6187,7 +6207,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init(req, &tmp_opt, skb, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ - inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; + inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); af_ops->init_req(req, sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 950e28c0cdf2..5ced3e4013e3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -73,7 +73,6 @@ #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> -#include <net/tcp_memcontrol.h> #include <net/busy_poll.h> #include <linux/inet.h> @@ -587,7 +586,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; + struct tcp_md5sig_key *key = NULL; const __u8 *hash_location = NULL; unsigned char newhash[16]; int genhash; @@ -627,7 +626,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG hash_location = tcp_parse_md5sig_option(th); - if (!sk && hash_location) { + if (sk && sk_fullsock(sk)) { + key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) + &ip_hdr(skb)->saddr, AF_INET); + } else if (hash_location) { /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -651,10 +653,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) goto release_sk1; - } else { - key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, - AF_INET) : NULL; } if (key) { @@ -675,7 +673,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. @@ -683,6 +682,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; + BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != + offsetof(struct inet_timewait_sock, tw_bound_dev_if)); + arg.tos = ip_hdr(skb)->tos; ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -921,7 +923,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk)); + sock_owned_by_user(sk) || + lockdep_is_held(&sk->sk_lock.slock)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) @@ -1275,6 +1278,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, ireq = inet_rsk(req); sk_daddr_set(newsk, ireq->ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); + newsk->sk_bound_dev_if = ireq->ir_iif; newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); @@ -1492,7 +1496,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (likely(sk->sk_rx_dst)) skb_dst_drop(skb); else - skb_dst_force(skb); + skb_dst_force_safe(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; @@ -1704,7 +1708,9 @@ do_time_wait: tcp_v4_timewait_ack(sk, skb); break; case TCP_TW_RST: - goto no_tcp_socket; + tcp_v4_send_reset(sk, skb); + inet_twsk_deschedule_put(inet_twsk(sk)); + goto discard_it; case TCP_TW_SUCCESS:; } goto discard_it; @@ -1720,8 +1726,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { - dst_hold(dst); + if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } @@ -1812,7 +1817,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); - sock_release_memcg(sk); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2158,6 +2165,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); int rx_queue; + int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || @@ -2175,17 +2183,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - if (sk->sk_state == TCP_LISTEN) + state = sk_state_load(sk); + if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else - /* - * because we dont lock socket, we might find a transient negative value + /* Because we don't lock the socket, + * we might find a transient negative value. */ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", - i, src, srcp, dest, destp, sk->sk_state, + i, src, srcp, dest, destp, state, tp->write_seq - tp->snd_una, rx_queue, timer_active, @@ -2199,8 +2208,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sk->sk_state == TCP_LISTEN ? - (fastopenq ? fastopenq->max_qlen : 0) : + state == TCP_LISTEN ? + fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)); } @@ -2334,11 +2343,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .init_cgroup = tcp_init_cgroup, - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, -#endif + .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); @@ -2376,6 +2381,10 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; + net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; + net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; + net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c deleted file mode 100644 index 2379c1b4efb2..000000000000 --- a/net/ipv4/tcp_memcontrol.c +++ /dev/null @@ -1,233 +0,0 @@ -#include <net/tcp.h> -#include <net/tcp_memcontrol.h> -#include <net/sock.h> -#include <net/ip.h> -#include <linux/nsproxy.h> -#include <linux/memcontrol.h> -#include <linux/module.h> - -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - /* - * The root cgroup does not use page_counters, but rather, - * rely on the data already collected by the network - * subsystem - */ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return 0; - - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; - - parent_cg = tcp_prot.proto_cgroup(parent); - if (parent_cg) - counter_parent = &parent_cg->memory_allocated; - - page_counter_init(&cg_proto->memory_allocated, counter_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); - - return 0; -} -EXPORT_SYMBOL(tcp_init_cgroup); - -void tcp_destroy_cgroup(struct mem_cgroup *memcg) -{ - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return; - - percpu_counter_destroy(&cg_proto->sockets_allocated); - - if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_dec(&memcg_socket_limit_enabled); - -} -EXPORT_SYMBOL(tcp_destroy_cgroup); - -static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - struct cg_proto *cg_proto; - int i; - int ret; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return -EINVAL; - - ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); - if (ret) - return ret; - - for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, nr_pages, - sysctl_tcp_mem[i]); - - if (nr_pages == PAGE_COUNTER_MAX) - clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else { - /* - * The active bit needs to be written after the static_key - * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. - * - * We need to do this, because static_keys will span multiple - * sites, but we can't control their order. If we mark a socket - * as accounted, but the accounting functions are not patched in - * yet, we'll lose accounting. - * - * We never race with the readers in sock_update_memcg(), - * because when this value change, the code to process it is not - * patched in yet. - * - * The activated bit is used to guarantee that no two writers - * will do the update in the same memcg. Without that, we can't - * properly shutdown the static key. - */ - if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_inc(&memcg_socket_limit_enabled); - set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - } - - return 0; -} - -enum { - RES_USAGE, - RES_LIMIT, - RES_MAX_USAGE, - RES_FAILCNT, -}; - -static DEFINE_MUTEX(tcp_limit_mutex); - -static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; - int ret = 0; - - buf = strstrip(buf); - - switch (of_cft(of)->private) { - case RES_LIMIT: - /* see memcontrol.c */ - ret = page_counter_memparse(buf, "-1", &nr_pages); - if (ret) - break; - mutex_lock(&tcp_limit_mutex); - ret = tcp_update_limit(memcg, nr_pages); - mutex_unlock(&tcp_limit_mutex); - break; - default: - ret = -EINVAL; - break; - } - return ret ?: nbytes; -} - -static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); - u64 val; - - switch (cft->private) { - case RES_LIMIT: - if (!cg_proto) - return PAGE_COUNTER_MAX; - val = cg_proto->memory_allocated.limit; - val *= PAGE_SIZE; - break; - case RES_USAGE: - if (!cg_proto) - val = atomic_long_read(&tcp_memory_allocated); - else - val = page_counter_read(&cg_proto->memory_allocated); - val *= PAGE_SIZE; - break; - case RES_FAILCNT: - if (!cg_proto) - return 0; - val = cg_proto->memory_allocated.failcnt; - break; - case RES_MAX_USAGE: - if (!cg_proto) - return 0; - val = cg_proto->memory_allocated.watermark; - val *= PAGE_SIZE; - break; - default: - BUG(); - } - return val; -} - -static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - memcg = mem_cgroup_from_css(of_css(of)); - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return nbytes; - - switch (of_cft(of)->private) { - case RES_MAX_USAGE: - page_counter_reset_watermark(&cg_proto->memory_allocated); - break; - case RES_FAILCNT: - cg_proto->memory_allocated.failcnt = 0; - break; - } - - return nbytes; -} - -static struct cftype tcp_files[] = { - { - .name = "kmem.tcp.limit_in_bytes", - .write = tcp_cgroup_write, - .read_u64 = tcp_cgroup_read, - .private = RES_LIMIT, - }, - { - .name = "kmem.tcp.usage_in_bytes", - .read_u64 = tcp_cgroup_read, - .private = RES_USAGE, - }, - { - .name = "kmem.tcp.failcnt", - .private = RES_FAILCNT, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { - .name = "kmem.tcp.max_usage_in_bytes", - .private = RES_MAX_USAGE, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { } /* terminate */ -}; - -static int __init tcp_memcontrol_init(void) -{ - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files)); - return 0; -} -__initcall(tcp_memcontrol_init); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ac6b1961ffeb..75632a925824 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -131,7 +131,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) - goto kill_with_rst; + return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || @@ -145,11 +145,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { -kill_with_rst: - inet_twsk_deschedule_put(tw); + TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) return TCP_TW_RST; - } /* FIN arrived, enter true time-wait state. */ tw->tw_substate = TCP_TIME_WAIT; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cb7ca569052c..fda379cd600d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2296,7 +2296,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, return; if (tcp_write_xmit(sk, cur_mss, nonagle, 0, - sk_gfp_atomic(sk, GFP_ATOMIC))) + sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2813,13 +2813,16 @@ begin_fwd: */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt, status; + int amt; if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - sk_memory_allocated_add(sk, amt, &status); + sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_charge_skmem(sk->sk_memcg, amt); } /* Send a FIN. The caller locks the socket for us. @@ -3150,7 +3153,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, err = 0, copied; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; struct sk_buff *syn_data; @@ -3188,17 +3191,18 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto fallback; syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); - copied = copy_from_iter(skb_put(syn_data, space), space, - &fo->data->msg_iter); - if (unlikely(!copied)) { - kfree_skb(syn_data); - goto fallback; - } - if (copied != space) { - skb_trim(syn_data, copied); - space = copied; + if (space) { + int copied = copy_from_iter(skb_put(syn_data, space), space, + &fo->data->msg_iter); + if (unlikely(!copied)) { + kfree_skb(syn_data); + goto fallback; + } + if (copied != space) { + skb_trim(syn_data, copied); + space = copied; + } } - /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; @@ -3352,8 +3356,9 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); - if (!buff) { + buff = alloc_skb(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); + if (unlikely(!buff)) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, @@ -3375,7 +3380,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ skb_mstamp_get(&buff->skb_mstamp); - tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); + tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); } EXPORT_SYMBOL_GPL(tcp_send_ack); @@ -3396,7 +3401,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = alloc_skb(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (!skb) return -1; @@ -3409,7 +3415,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); NET_INC_STATS(sock_net(sk), mib); - return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); + return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } void tcp_send_window_probe(struct sock *sk) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c9c716a483e4..a4730a28b220 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -24,9 +24,6 @@ int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; -int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME; -int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES; -int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; @@ -168,7 +165,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data) + if (tp->syn_data && icsk->icsk_retransmits == 1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -176,6 +173,18 @@ static int tcp_write_timeout(struct sock *sk) syn_set = true; } else { if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + /* Some middle-boxes may black-hole Fast Open _after_ + * the handshake. Therefore we conservatively disable + * Fast Open on this path on recurring timeouts with + * few or zero bytes acked after Fast Open. + */ + if (tp->syn_data_acked && + tp->bytes_acked <= tp->rx_opt.mss_clamp) { + tcp_fastopen_cache_set(sk, 0, NULL, true, 0); + if (icsk->icsk_retransmits == sysctl_tcp_retries1) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } /* Black hole detection */ tcp_mtu_probing(icsk, sk); diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 17d35662930d..3e6a472e6b88 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -219,7 +219,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); - return tp->snd_cwnd - reduction; + return max_t(int, tp->snd_cwnd - reduction, 2); } static struct tcp_congestion_ops tcp_yeah __read_mostly = { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24ec14f9825c..dc45b538e237 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -100,7 +100,6 @@ #include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> -#include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/net_namespace.h> @@ -114,6 +113,7 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" +#include <net/sock_reuseport.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -138,7 +138,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned long *bitmap, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + const struct sock *sk2, + bool match_wildcard), unsigned int log) { struct sock *sk2; @@ -153,8 +154,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2)) { + saddr_comp(sk, sk2, true)) { if (!bitmap) return 1; __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); @@ -171,7 +173,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2, + bool match_wildcard)) { struct sock *sk2; struct hlist_nulls_node *node; @@ -187,8 +190,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || + rcu_access_pointer(sk->sk_reuseport_cb) || !uid_eq(uid, sock_i_uid(sk2))) && - saddr_comp(sk, sk2)) { + saddr_comp(sk, sk2, true)) { res = 1; break; } @@ -197,6 +201,35 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, return res; } +static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct net *net = sock_net(sk); + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + struct sock *sk2; + + sk_nulls_for_each(sk2, node, &hslot->head) { + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && + (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + (*saddr_same)(sk, sk2, false)) { + return reuseport_add_sock(sk, sk2); + } + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * @@ -208,7 +241,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + const struct sock *sk2, + bool match_wildcard), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -291,6 +325,14 @@ found: udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { + if (sk->sk_reuseport && + udp_reuseport_add_sock(sk, hslot, saddr_comp)) { + inet_sk(sk)->inet_num = 0; + udp_sk(sk)->udp_port_hash = 0; + udp_sk(sk)->udp_portaddr_hash ^= snum; + goto fail_unlock; + } + sk_nulls_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -310,13 +352,22 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses + * match_wildcard == false: addresses must be exactly the same, i.e. + * 0.0.0.0 only equals to 0.0.0.0 + */ +static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - return (!ipv6_only_sock(sk2) && - (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || - inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); + if (!ipv6_only_sock(sk2)) { + if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr) + return 1; + if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr) + return match_wildcard; + } + return 0; } static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, @@ -442,7 +493,8 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2) + struct udp_hslot *hslot2, unsigned int slot2, + struct sk_buff *skb) { struct sock *sk, *result; struct hlist_nulls_node *node; @@ -460,8 +512,15 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -479,6 +538,7 @@ begin: if (get_nulls_value(node) != slot2) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -495,7 +555,7 @@ begin: */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, - int dif, struct udp_table *udptable) + int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; struct hlist_nulls_node *node; @@ -515,7 +575,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); if (!result) { hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; @@ -525,7 +585,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); } rcu_read_unlock(); return result; @@ -541,8 +601,15 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -561,6 +628,7 @@ begin: goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score(result, net, saddr, hnum, sport, @@ -582,13 +650,14 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - udptable); + udptable, skb); } struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); + return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, + &udp_table, NULL); } EXPORT_SYMBOL_GPL(udp4_lib_lookup); @@ -636,7 +705,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, udptable); + iph->saddr, uh->source, skb->dev->ifindex, udptable, + NULL); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -773,7 +843,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, else if (skb_is_gso(skb)) uh->check = ~udp_v4_check(len, saddr, daddr, 0); else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) { + (skb_dst(skb)->dev->features & + (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); @@ -1026,8 +1097,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flow_flags, faddr, saddr, dport, inet->inet_sport); - if (!saddr && ipc.oif) - l3mdev_get_saddr(net, ipc.oif, fl4); + if (!saddr && ipc.oif) { + err = l3mdev_get_saddr(net, ipc.oif, fl4); + if (err < 0) + goto out; + } security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); @@ -1271,6 +1345,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; bool slow; if (flags & MSG_ERRQUEUE) @@ -1296,11 +1371,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { @@ -1396,6 +1472,8 @@ void udp_lib_unhash(struct sock *sk) hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); if (sk_nulls_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; @@ -1423,22 +1501,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; - if (hslot2 != nhslot2) { + + if (hslot2 != nhslot2 || + rcu_access_pointer(sk->sk_reuseport_cb)) { hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); - - spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); - hslot2->count--; - spin_unlock(&hslot2->lock); - - spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &nhslot2->head); - nhslot2->count++; - spin_unlock(&nhslot2->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + } spin_unlock_bh(&hslot->lock); } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 6116604bf6e8..df1966f3b6ec 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -44,7 +44,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -52,7 +52,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, - req->id.idiag_if, tbl); + req->id.idiag_if, tbl, NULL); #endif else goto out_nosk; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f9386160cbee..4c519c1dc161 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,6 +21,7 @@ static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; struct udp_offload_priv { struct udp_offload *offload; + possible_net_t net; struct rcu_head rcu; struct udp_offload_priv __rcu *next; }; @@ -60,8 +61,9 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, /* Try to offload checksum if possible */ offload_csum = !!(need_csum && - (skb->dev->features & - (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM))); + ((skb->dev->features & NETIF_F_HW_CSUM) || + (skb->dev->features & (is_ipv6 ? + NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & features; @@ -241,13 +243,14 @@ out: return segs; } -int udp_add_offload(struct udp_offload *uo) +int udp_add_offload(struct net *net, struct udp_offload *uo) { struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); if (!new_offload) return -ENOMEM; + write_pnet(&new_offload->net, net); new_offload->offload = uo; spin_lock(&udp_offload_lock); @@ -311,7 +314,8 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { - if (uo_priv->offload->port == uh->dest && + if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) && + uo_priv->offload->port == uh->dest && uo_priv->offload->callbacks.gro_receive) goto unflush; } @@ -389,7 +393,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) uo_priv = rcu_dereference(udp_offload_base); for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { - if (uo_priv->offload->port == uh->dest && + if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) && + uo_priv->offload->port == uh->dest && uo_priv->offload->callbacks.gro_complete) break; } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index aba428626b52..0ec08814f37d 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -74,10 +74,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); -int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) +void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, + __be16 df, __be16 src_port, __be16 dst_port, + bool xnet, bool nocheck) { struct udphdr *uh; @@ -91,8 +91,7 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, udp_set_csum(nocheck, skb, src, dst, skb->len); - return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, - tos, ttl, df, xnet); + iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1e0c3c835a63..7b0edb37a115 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -259,7 +259,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xfrm_dst_ifdown(dst, dev); } -static struct dst_ops xfrm4_dst_ops = { +static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, @@ -273,7 +273,7 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .dst_ops = &xfrm4_dst_ops, + .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .decode_session = _decode_session4, @@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static int __net_init xfrm4_net_init(struct net *net) +static int __net_init xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -323,7 +323,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm4_net_exit(struct net *net) +static void __net_exit xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -335,12 +335,44 @@ static void __net_exit xfrm4_net_exit(struct net *net) if (!net_eq(net, &init_net)) kfree(table); } +#else /* CONFIG_SYSCTL */ +static int inline xfrm4_net_sysctl_init(struct net *net) +{ + return 0; +} + +static void inline xfrm4_net_sysctl_exit(struct net *net) +{ +} +#endif + +static int __net_init xfrm4_net_init(struct net *net) +{ + int ret; + + memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template, + sizeof(xfrm4_dst_ops_template)); + ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops); + if (ret) + return ret; + + ret = xfrm4_net_sysctl_init(net); + if (ret) + dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); + + return ret; +} + +static void __net_exit xfrm4_net_exit(struct net *net) +{ + xfrm4_net_sysctl_exit(net); + dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); +} static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, }; -#endif static void __init xfrm4_policy_init(void) { @@ -349,13 +381,9 @@ static void __init xfrm4_policy_init(void) void __init xfrm4_init(void) { - dst_entries_init(&xfrm4_dst_ops); - xfrm4_state_init(); xfrm4_policy_init(); xfrm4_protocol_init(); -#ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm4_net_ops); -#endif } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 983bb999738c..bb7dabe2ebbf 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -94,6 +94,7 @@ config IPV6_MIP6 config IPV6_ILA tristate "IPv6: Identifier Locator Addressing (ILA)" + depends on NETFILTER select LWTUNNEL ---help--- Support for IPv6 Identifier Locator Addressing (ILA). diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2c900c7b7eb1..2fbd90bf8d33 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -34,7 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o -obj-$(CONFIG_IPV6_ILA) += ila.o +obj-$(CONFIG_IPV6_ILA) += ila/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d84742f003a9..38eeddedfc21 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -70,7 +70,7 @@ #include <net/sock.h> #include <net/snmp.h> -#include <net/af_ieee802154.h> +#include <net/6lowpan.h> #include <net/firewire.h> #include <net/ipv6.h> #include <net/protocol.h> @@ -350,6 +350,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) setup_timer(&ndev->rs_timer, addrconf_rs_timer, (unsigned long)ndev); memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); + + if (ndev->cnf.stable_secret.initialized) + ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; + else + ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; + ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -1766,12 +1772,13 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) { + if (dad_failed) + ifp->flags |= IFA_F_DADFAILED; + if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); addrconf_del_dad_work(ifp); ifp->flags |= IFA_F_TENTATIVE; - if (dad_failed) - ifp->flags |= IFA_F_DADFAILED; spin_unlock_bh(&ifp->lock); if (dad_failed) ipv6_ifa_notify(0, ifp); @@ -1947,9 +1954,9 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) { - if (dev->addr_len != IEEE802154_ADDR_LEN) + if (dev->addr_len != EUI64_ADDR_LEN) return -1; - memcpy(eui, dev->dev_addr, 8); + memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN); eui[0] ^= 2; return 0; } @@ -2041,7 +2048,6 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) case ARPHRD_IPGRE: return addrconf_ifid_gre(eui, dev); case ARPHRD_6LOWPAN: - case ARPHRD_IEEE802154: return addrconf_ifid_eui64(eui, dev); case ARPHRD_IEEE1394: return addrconf_ifid_ieee1394(eui, dev); @@ -2314,6 +2320,12 @@ static void manage_tempaddrs(struct inet6_dev *idev, } } +static bool is_addr_mode_generate_stable(struct inet6_dev *idev) +{ + return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY || + idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; +} + void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; @@ -2427,8 +2439,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) in6_dev->token.s6_addr + 8, 8); read_unlock_bh(&in6_dev->lock); tokenized = true; - } else if (in6_dev->addr_gen_mode == - IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + } else if (is_addr_mode_generate_stable(in6_dev) && !ipv6_generate_stable_address(&addr, 0, in6_dev)) { addr_flags |= IFA_F_STABLE_PRIVACY; @@ -2455,7 +2466,7 @@ ok: #ifdef CONFIG_IPV6_OPTIMISTIC_DAD if (in6_dev->cnf.optimistic_dad && !net->ipv6.devconf_all->forwarding && sllao) - addr_flags = IFA_F_OPTIMISTIC; + addr_flags |= IFA_F_OPTIMISTIC; #endif /* Do not allow to create too much of autoconfigured @@ -3028,6 +3039,17 @@ retry: return 0; } +static void ipv6_gen_mode_random_init(struct inet6_dev *idev) +{ + struct ipv6_stable_secret *s = &idev->cnf.stable_secret; + + if (s->initialized) + return; + s = &idev->cnf.stable_secret; + get_random_bytes(&s->secret, sizeof(s->secret)); + s->initialized = true; +} + static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) { struct in6_addr addr; @@ -3038,13 +3060,18 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) { + switch (idev->addr_gen_mode) { + case IN6_ADDR_GEN_MODE_RANDOM: + ipv6_gen_mode_random_init(idev); + /* fallthrough */ + case IN6_ADDR_GEN_MODE_STABLE_PRIVACY: if (!ipv6_generate_stable_address(&addr, 0, idev)) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); - } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) { + break; + case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we * only need to care about prefix routes if ipv6_generate_eui64 * couldn't generate one. @@ -3053,6 +3080,11 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + break; + case IN6_ADDR_GEN_MODE_NONE: + default: + /* will not add any link local address */ + break; } } @@ -3066,10 +3098,10 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_FDDI) && (dev->type != ARPHRD_ARCNET) && (dev->type != ARPHRD_INFINIBAND) && - (dev->type != ARPHRD_IEEE802154) && (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && - (dev->type != ARPHRD_6LOWPAN)) { + (dev->type != ARPHRD_6LOWPAN) && + (dev->type != ARPHRD_NONE)) { /* Alas, we support only Ethernet autoconfiguration. */ return; } @@ -3078,6 +3110,11 @@ static void addrconf_dev_config(struct net_device *dev) if (IS_ERR(idev)) return; + /* this device type has no EUI support */ + if (dev->type == ARPHRD_NONE && + idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) + idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM; + addrconf_addr_gen(idev, false); } @@ -3287,7 +3324,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_PRE_TYPE_CHANGE: case NETDEV_POST_TYPE_CHANGE: - addrconf_type_change(dev, event); + if (idev) + addrconf_type_change(dev, event); break; } @@ -3642,7 +3680,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, NULL); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); out: in6_ifa_put(ifp); rtnl_unlock(); @@ -4921,7 +4959,8 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (mode != IN6_ADDR_GEN_MODE_EUI64 && mode != IN6_ADDR_GEN_MODE_NONE && - mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY) + mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY && + mode != IN6_ADDR_GEN_MODE_RANDOM) return -EINVAL; if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY && @@ -5200,6 +5239,20 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, } static +int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table lctl; + int min_hl = 1, max_hl = 255; + + lctl = *ctl; + lctl.extra1 = &min_hl; + lctl.extra2 = &max_hl; + + return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos); +} + +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -5363,13 +5416,10 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, goto out; } - if (!write) { - err = snprintf(str, sizeof(str), "%pI6", - &secret->secret); - if (err >= sizeof(str)) { - err = -EIO; - goto out; - } + err = snprintf(str, sizeof(str), "%pI6", &secret->secret); + if (err >= sizeof(str)) { + err = -EIO; + goto out; } err = proc_dostring(&lctl, write, buffer, lenp, ppos); @@ -5454,7 +5504,7 @@ static struct addrconf_sysctl_table .data = &ipv6_devconf.hop_limit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = addrconf_sysctl_hop_limit, }, { .procname = "mtu", diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 882124ebb438..a8f6986dcbe5 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -552,7 +552,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && ip6addrlbl_hold(p)) + if (p && !ip6addrlbl_hold(p)) p = NULL; lseq = ip6addrlbl_table.seq; rcu_read_unlock(); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 44bb66bde0e2..9f5137cd604e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; @@ -428,9 +431,11 @@ void inet6_destroy_sock(struct sock *sk) /* Free tx options */ - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } } EXPORT_SYMBOL_GPL(inet6_destroy_sock); @@ -659,7 +664,10 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - final_p = fl6_update_dst(&fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), + &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -668,7 +676,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return 0; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index d70b0238f468..517c55b01ba8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -167,8 +167,10 @@ ipv4_connected: security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = flowlabel ? flowlabel->opt : np->opt; + rcu_read_lock(); + opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); + rcu_read_unlock(); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); err = 0; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ce203b0402be..ea7c4d64a00a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; + atomic_set(&opt2->refcnt, 1); } return opt2; } @@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); - + atomic_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36c5a98b0472..0a37ddc7af51 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } -/* - * Special lock-class for __icmpv6_sk: - */ -static struct lock_class_key icmpv6_socket_sk_dst_lock_key; - static int __net_init icmpv6_sk_init(struct net *net) { struct sock *sk; @@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net) net->ipv6.icmp_sk[i] = sk; - /* - * Split off their lock-class, because sk->sk_dst_lock - * gets used from softirqs, which is safe for - * __icmpv6_sk (because those never get directly used - * via userspace syscalls), but unsafe for normal sockets. - */ - lockdep_set_class(&sk->sk_dst_lock, - &icmpv6_socket_sk_dst_lock_key); - /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile new file mode 100644 index 000000000000..4b32e5921e5c --- /dev/null +++ b/net/ipv6/ila/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for ILA module +# + +obj-$(CONFIG_IPV6_ILA) += ila.o + +ila-objs := ila_common.o ila_lwt.o ila_xlat.o diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h new file mode 100644 index 000000000000..28542cb2b387 --- /dev/null +++ b/net/ipv6/ila/ila.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015 Tom Herbert <tom@herbertland.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ + +#ifndef __ILA_H +#define __ILA_H + +#include <linux/errno.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <uapi/linux/ila.h> + +struct ila_params { + __be64 locator; + __be64 locator_match; + __wsum csum_diff; +}; + +static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) +{ + __be32 diff[] = { + ~from[0], ~from[1], to[0], to[1], + }; + + return csum_partial(diff, sizeof(diff), 0); +} + +void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); + +int ila_lwt_init(void); +void ila_lwt_fini(void); +int ila_xlat_init(void); +void ila_xlat_fini(void); + +#endif /* __ILA_H */ diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c new file mode 100644 index 000000000000..32dc9aab7297 --- /dev/null +++ b/net/ipv6/ila/ila_common.c @@ -0,0 +1,103 @@ +#include <linux/errno.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ip.h> +#include <net/ip6_fib.h> +#include <net/lwtunnel.h> +#include <net/protocol.h> +#include <uapi/linux/ila.h> +#include "ila.h" + +static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + if (*(__be64 *)&ip6h->daddr == p->locator_match) + return p->csum_diff; + else + return compute_csum_diff8((__be32 *)&ip6h->daddr, + (__be32 *)&p->locator); +} + +void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +{ + __wsum diff; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + size_t nhoff = sizeof(struct ipv6hdr); + + /* First update checksum */ + switch (ip6h->nexthdr) { + case NEXTHDR_TCP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { + struct tcphdr *th = (struct tcphdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&th->check, skb, + diff, true); + } + break; + case NEXTHDR_UDP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { + struct udphdr *uh = (struct udphdr *) + (skb_network_header(skb) + nhoff); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&uh->check, skb, + diff, true); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + break; + case NEXTHDR_ICMP: + if (likely(pskb_may_pull(skb, + nhoff + sizeof(struct icmp6hdr)))) { + struct icmp6hdr *ih = (struct icmp6hdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, + diff, true); + } + break; + } + + /* Now change destination address */ + *(__be64 *)&ip6h->daddr = p->locator; +} + +static int __init ila_init(void) +{ + int ret; + + ret = ila_lwt_init(); + + if (ret) + goto fail_lwt; + + ret = ila_xlat_init(); + if (ret) + goto fail_xlat; + + return 0; +fail_xlat: + ila_lwt_fini(); +fail_lwt: + return ret; +} + +static void __exit ila_fini(void) +{ + ila_xlat_fini(); + ila_lwt_fini(); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila.c b/net/ipv6/ila/ila_lwt.c index 1a6852e1ac69..2ae3c4fd8aab 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila/ila_lwt.c @@ -11,12 +11,7 @@ #include <net/lwtunnel.h> #include <net/protocol.h> #include <uapi/linux/ila.h> - -struct ila_params { - __be64 locator; - __be64 locator_match; - __wsum csum_diff; -}; +#include "ila.h" static inline struct ila_params *ila_params_lwtunnel( struct lwtunnel_state *lwstate) @@ -24,73 +19,6 @@ static inline struct ila_params *ila_params_lwtunnel( return (struct ila_params *)lwstate->data; } -static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) -{ - __be32 diff[] = { - ~from[0], ~from[1], to[0], to[1], - }; - - return csum_partial(diff, sizeof(diff), 0); -} - -static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) -{ - if (*(__be64 *)&ip6h->daddr == p->locator_match) - return p->csum_diff; - else - return compute_csum_diff8((__be32 *)&ip6h->daddr, - (__be32 *)&p->locator); -} - -static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) -{ - __wsum diff; - struct ipv6hdr *ip6h = ipv6_hdr(skb); - size_t nhoff = sizeof(struct ipv6hdr); - - /* First update checksum */ - switch (ip6h->nexthdr) { - case NEXTHDR_TCP: - if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { - struct tcphdr *th = (struct tcphdr *) - (skb_network_header(skb) + nhoff); - - diff = get_csum_diff(ip6h, p); - inet_proto_csum_replace_by_diff(&th->check, skb, - diff, true); - } - break; - case NEXTHDR_UDP: - if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { - struct udphdr *uh = (struct udphdr *) - (skb_network_header(skb) + nhoff); - - if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { - diff = get_csum_diff(ip6h, p); - inet_proto_csum_replace_by_diff(&uh->check, skb, - diff, true); - if (!uh->check) - uh->check = CSUM_MANGLED_0; - } - } - break; - case NEXTHDR_ICMP: - if (likely(pskb_may_pull(skb, - nhoff + sizeof(struct icmp6hdr)))) { - struct icmp6hdr *ih = (struct icmp6hdr *) - (skb_network_header(skb) + nhoff); - - diff = get_csum_diff(ip6h, p); - inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, - diff, true); - } - break; - } - - /* Now change destination address */ - *(__be64 *)&ip6h->daddr = p->locator; -} - static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -213,17 +141,12 @@ static const struct lwtunnel_encap_ops ila_encap_ops = { .cmp_encap = ila_encap_cmp, }; -static int __init ila_init(void) +int ila_lwt_init(void) { return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); } -static void __exit ila_fini(void) +void ila_lwt_fini(void) { lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); } - -module_init(ila_init); -module_exit(ila_fini); -MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c new file mode 100644 index 000000000000..295ca29a23c3 --- /dev/null +++ b/net/ipv6/ila/ila_xlat.c @@ -0,0 +1,680 @@ +#include <linux/jhash.h> +#include <linux/netfilter.h> +#include <linux/rcupdate.h> +#include <linux/rhashtable.h> +#include <linux/vmalloc.h> +#include <net/genetlink.h> +#include <net/ila.h> +#include <net/netns/generic.h> +#include <uapi/linux/genetlink.h> +#include "ila.h" + +struct ila_xlat_params { + struct ila_params ip; + __be64 identifier; + int ifindex; + unsigned int dir; +}; + +struct ila_map { + struct ila_xlat_params p; + struct rhash_head node; + struct ila_map __rcu *next; + struct rcu_head rcu; +}; + +static unsigned int ila_net_id; + +struct ila_net { + struct rhashtable rhash_table; + spinlock_t *locks; /* Bucket locks for entry manipulation */ + unsigned int locks_mask; + bool hooks_registered; +}; + +#define LOCKS_PER_CPU 10 + +static int alloc_ila_locks(struct ila_net *ilan) +{ + unsigned int i, size; + unsigned int nr_pcpus = num_possible_cpus(); + + nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); + + if (sizeof(spinlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(spinlock_t) > PAGE_SIZE) + ilan->locks = vmalloc(size * sizeof(spinlock_t)); + else +#endif + ilan->locks = kmalloc_array(size, sizeof(spinlock_t), + GFP_KERNEL); + if (!ilan->locks) + return -ENOMEM; + for (i = 0; i < size; i++) + spin_lock_init(&ilan->locks[i]); + } + ilan->locks_mask = size - 1; + + return 0; +} + +static u32 hashrnd __read_mostly; +static __always_inline void __ila_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static inline u32 ila_identifier_hash(__be64 identifier) +{ + u32 *v = (u32 *)&identifier; + + return jhash_2words(v[0], v[1], hashrnd); +} + +static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier) +{ + return &ilan->locks[ila_identifier_hash(identifier) & ilan->locks_mask]; +} + +static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc, + int ifindex, unsigned int dir) +{ + return (ila->p.ip.locator_match && ila->p.ip.locator_match != loc) || + (ila->p.ifindex && ila->p.ifindex != ifindex) || + !(ila->p.dir & dir); +} + +static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p) +{ + return (ila->p.ip.locator_match != p->ip.locator_match) || + (ila->p.ifindex != p->ifindex) || + (ila->p.dir != p->dir); +} + +static int ila_cmpfn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct ila_map *ila = obj; + + return (ila->p.identifier != *(__be64 *)arg->key); +} + +static inline int ila_order(struct ila_map *ila) +{ + int score = 0; + + if (ila->p.ip.locator_match) + score += 1 << 0; + + if (ila->p.ifindex) + score += 1 << 1; + + return score; +} + +static const struct rhashtable_params rht_params = { + .nelem_hint = 1024, + .head_offset = offsetof(struct ila_map, node), + .key_offset = offsetof(struct ila_map, p.identifier), + .key_len = sizeof(u64), /* identifier */ + .max_size = 1048576, + .min_size = 256, + .automatic_shrinking = true, + .obj_cmpfn = ila_cmpfn, +}; + +static struct genl_family ila_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, +}; + +static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, + [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, + [ILA_ATTR_DIR] = { .type = NLA_U32, }, +}; + +static int parse_nl_config(struct genl_info *info, + struct ila_xlat_params *p) +{ + memset(p, 0, sizeof(*p)); + + if (info->attrs[ILA_ATTR_IDENTIFIER]) + p->identifier = (__force __be64)nla_get_u64( + info->attrs[ILA_ATTR_IDENTIFIER]); + + if (info->attrs[ILA_ATTR_LOCATOR]) + p->ip.locator = (__force __be64)nla_get_u64( + info->attrs[ILA_ATTR_LOCATOR]); + + if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) + p->ip.locator_match = (__force __be64)nla_get_u64( + info->attrs[ILA_ATTR_LOCATOR_MATCH]); + + if (info->attrs[ILA_ATTR_IFINDEX]) + p->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); + + if (info->attrs[ILA_ATTR_DIR]) + p->dir = nla_get_u32(info->attrs[ILA_ATTR_DIR]); + + return 0; +} + +/* Must be called with rcu readlock */ +static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc, + int ifindex, + unsigned int dir, + struct ila_net *ilan) +{ + struct ila_map *ila; + + ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params); + while (ila) { + if (!ila_cmp_wildcards(ila, loc, ifindex, dir)) + return ila; + ila = rcu_access_pointer(ila->next); + } + + return NULL; +} + +/* Must be called with rcu readlock */ +static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p, + struct ila_net *ilan) +{ + struct ila_map *ila; + + ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + rht_params); + while (ila) { + if (!ila_cmp_params(ila, p)) + return ila; + ila = rcu_access_pointer(ila->next); + } + + return NULL; +} + +static inline void ila_release(struct ila_map *ila) +{ + kfree_rcu(ila, rcu); +} + +static void ila_free_cb(void *ptr, void *arg) +{ + struct ila_map *ila = (struct ila_map *)ptr, *next; + + /* Assume rcu_readlock held */ + while (ila) { + next = rcu_access_pointer(ila->next); + ila_release(ila); + ila = next; + } +} + +static int ila_xlat_addr(struct sk_buff *skb, int dir); + +static unsigned int +ila_nf_input(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + ila_xlat_addr(skb, ILA_DIR_IN); + return NF_ACCEPT; +} + +static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = { + { + .hook = ila_nf_input, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = -1, + }, +}; + +static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + struct ila_map *ila, *head; + spinlock_t *lock = ila_get_lock(ilan, p->identifier); + int err = 0, order; + + if (!ilan->hooks_registered) { + /* We defer registering net hooks in the namespace until the + * first mapping is added. + */ + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (err) + return err; + + ilan->hooks_registered = true; + } + + ila = kzalloc(sizeof(*ila), GFP_KERNEL); + if (!ila) + return -ENOMEM; + + ila->p = *p; + + if (p->ip.locator_match) { + /* Precompute checksum difference for translation since we + * know both the old identifier and the new one. + */ + ila->p.ip.csum_diff = compute_csum_diff8( + (__be32 *)&p->ip.locator_match, + (__be32 *)&p->ip.locator); + } + + order = ila_order(ila); + + spin_lock(lock); + + head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + rht_params); + if (!head) { + /* New entry for the rhash_table */ + err = rhashtable_lookup_insert_fast(&ilan->rhash_table, + &ila->node, rht_params); + } else { + struct ila_map *tila = head, *prev = NULL; + + do { + if (!ila_cmp_params(tila, p)) { + err = -EEXIST; + goto out; + } + + if (order > ila_order(tila)) + break; + + prev = tila; + tila = rcu_dereference_protected(tila->next, + lockdep_is_held(lock)); + } while (tila); + + if (prev) { + /* Insert in sub list of head */ + RCU_INIT_POINTER(ila->next, tila); + rcu_assign_pointer(prev->next, ila); + } else { + /* Make this ila new head */ + RCU_INIT_POINTER(ila->next, head); + err = rhashtable_replace_fast(&ilan->rhash_table, + &head->node, + &ila->node, rht_params); + if (err) + goto out; + } + } + +out: + spin_unlock(lock); + + if (err) + kfree(ila); + + return err; +} + +static int ila_del_mapping(struct net *net, struct ila_xlat_params *p) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + struct ila_map *ila, *head, *prev; + spinlock_t *lock = ila_get_lock(ilan, p->identifier); + int err = -ENOENT; + + spin_lock(lock); + + head = rhashtable_lookup_fast(&ilan->rhash_table, + &p->identifier, rht_params); + ila = head; + + prev = NULL; + + while (ila) { + if (ila_cmp_params(ila, p)) { + prev = ila; + ila = rcu_dereference_protected(ila->next, + lockdep_is_held(lock)); + continue; + } + + err = 0; + + if (prev) { + /* Not head, just delete from list */ + rcu_assign_pointer(prev->next, ila->next); + } else { + /* It is the head. If there is something in the + * sublist we need to make a new head. + */ + head = rcu_dereference_protected(ila->next, + lockdep_is_held(lock)); + if (head) { + /* Put first entry in the sublist into the + * table + */ + err = rhashtable_replace_fast( + &ilan->rhash_table, &ila->node, + &head->node, rht_params); + if (err) + goto out; + } else { + /* Entry no longer used */ + err = rhashtable_remove_fast(&ilan->rhash_table, + &ila->node, + rht_params); + } + } + + ila_release(ila); + + break; + } + +out: + spin_unlock(lock); + + return err; +} + +static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_xlat_params p; + int err; + + err = parse_nl_config(info, &p); + if (err) + return err; + + return ila_add_mapping(net, &p); +} + +static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_xlat_params p; + int err; + + err = parse_nl_config(info, &p); + if (err) + return err; + + ila_del_mapping(net, &p); + + return 0; +} + +static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) +{ + if (nla_put_u64(msg, ILA_ATTR_IDENTIFIER, + (__force u64)ila->p.identifier) || + nla_put_u64(msg, ILA_ATTR_LOCATOR, + (__force u64)ila->p.ip.locator) || + nla_put_u64(msg, ILA_ATTR_LOCATOR_MATCH, + (__force u64)ila->p.ip.locator_match) || + nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) || + nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir)) + return -1; + + return 0; +} + +static int ila_dump_info(struct ila_map *ila, + u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (ila_fill_info(ila, skb) < 0) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct sk_buff *msg; + struct ila_xlat_params p; + struct ila_map *ila; + int ret; + + ret = parse_nl_config(info, &p); + if (ret) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rcu_read_lock(); + + ila = ila_lookup_by_params(&p, ilan); + if (ila) { + ret = ila_dump_info(ila, + info->snd_portid, + info->snd_seq, 0, msg, + info->genlhdr->cmd); + } + + rcu_read_unlock(); + + if (ret < 0) + goto out_free; + + return genlmsg_reply(msg, info); + +out_free: + nlmsg_free(msg); + return ret; +} + +struct ila_dump_iter { + struct rhashtable_iter rhiter; +}; + +static int ila_nl_dump_start(struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; + + return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter); +} + +static int ila_nl_dump_done(struct netlink_callback *cb) +{ + struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; + + rhashtable_walk_exit(&iter->rhiter); + + return 0; +} + +static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; + struct rhashtable_iter *rhiter = &iter->rhiter; + struct ila_map *ila; + int ret; + + ret = rhashtable_walk_start(rhiter); + if (ret && ret != -EAGAIN) + goto done; + + for (;;) { + ila = rhashtable_walk_next(rhiter); + + if (IS_ERR(ila)) { + if (PTR_ERR(ila) == -EAGAIN) + continue; + ret = PTR_ERR(ila); + goto done; + } else if (!ila) { + break; + } + + while (ila) { + ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, ILA_CMD_GET); + if (ret) + goto done; + + ila = rcu_access_pointer(ila->next); + } + } + + ret = skb->len; + +done: + rhashtable_walk_stop(rhiter); + return ret; +} + +static const struct genl_ops ila_nl_ops[] = { + { + .cmd = ILA_CMD_ADD, + .doit = ila_nl_cmd_add_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_DEL, + .doit = ila_nl_cmd_del_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_GET, + .doit = ila_nl_cmd_get_mapping, + .start = ila_nl_dump_start, + .dumpit = ila_nl_dump, + .done = ila_nl_dump_done, + .policy = ila_nl_policy, + }, +}; + +#define ILA_HASH_TABLE_SIZE 1024 + +static __net_init int ila_init_net(struct net *net) +{ + int err; + struct ila_net *ilan = net_generic(net, ila_net_id); + + err = alloc_ila_locks(ilan); + if (err) + return err; + + rhashtable_init(&ilan->rhash_table, &rht_params); + + return 0; +} + +static __net_exit void ila_exit_net(struct net *net) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + + rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); + + kvfree(ilan->locks); + + if (ilan->hooks_registered) + nf_unregister_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); +} + +static struct pernet_operations ila_net_ops = { + .init = ila_init_net, + .exit = ila_exit_net, + .id = &ila_net_id, + .size = sizeof(struct ila_net), +}; + +static int ila_xlat_addr(struct sk_buff *skb, int dir) +{ + struct ila_map *ila; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); + struct ila_net *ilan = net_generic(net, ila_net_id); + __be64 identifier, locator_match; + size_t nhoff; + + /* Assumes skb contains a valid IPv6 header that is pulled */ + + identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8]; + locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0]; + nhoff = sizeof(struct ipv6hdr); + + rcu_read_lock(); + + ila = ila_lookup_wildcards(identifier, locator_match, + skb->dev->ifindex, dir, ilan); + if (ila) + update_ipv6_locator(skb, &ila->p.ip); + + rcu_read_unlock(); + + return 0; +} + +int ila_xlat_incoming(struct sk_buff *skb) +{ + return ila_xlat_addr(skb, ILA_DIR_IN); +} +EXPORT_SYMBOL(ila_xlat_incoming); + +int ila_xlat_outgoing(struct sk_buff *skb) +{ + return ila_xlat_addr(skb, ILA_DIR_OUT); +} +EXPORT_SYMBOL(ila_xlat_outgoing); + +int ila_xlat_init(void) +{ + int ret; + + ret = register_pernet_device(&ila_net_ops); + if (ret) + goto exit; + + ret = genl_register_family_with_ops(&ila_nl_family, + ila_nl_ops); + if (ret < 0) + goto unregister; + + return 0; + +unregister: + unregister_pernet_device(&ila_net_ops); +exit: + return ret; +} + +void ila_xlat_fini(void) +{ + genl_unregister_family(&ila_nl_family); + unregister_pernet_device(&ila_net_ops); +} diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5d1c7cee2cb2..36c3f0155010 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -51,12 +51,12 @@ int inet6_csk_bind_conflict(const struct sock *sk, (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid((struct sock *)sk2))))) { - if (ipv6_rcv_saddr_equal(sk, sk2)) + if (ipv6_rcv_saddr_equal(sk, sk2, true)) break; } if (!relax && reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN && - ipv6_rcv_saddr_equal(sk, sk2)) + ipv6_rcv_saddr_equal(sk, sk2, true)) break; } } @@ -78,7 +78,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; fl6->daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); fl6->saddr = ireq->ir_v6_loc_addr; fl6->flowi6_oif = ireq->ir_iif; fl6->flowi6_mark = ireq->ir_mark; @@ -109,14 +111,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline -void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) -{ - __ip6_dst_store(sk, dst, daddr, saddr); -} - -static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { return __sk_dst_check(sk, cookie); @@ -142,14 +136,16 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_dport = inet->inet_dport; security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); dst = __inet6_csk_dst_check(sk, np->dst_cookie); if (!dst) { dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!IS_ERR(dst)) - __inet6_csk_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return dst; } @@ -175,7 +171,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3c7b9310b33f..f37f18b6b40c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -24,7 +24,6 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> -#include <linux/mroute.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> @@ -1571,13 +1570,11 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], return -EEXIST; } else { t = nt; - - ip6gre_tunnel_unlink(ign, t); - ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); - ip6gre_tunnel_link(ign, t); - netdev_state_change(dev); } + ip6gre_tunnel_unlink(ign, t); + ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); + ip6gre_tunnel_link(ign, t); return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e6a7bd15b9b7..23de98f976d5 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1322,7 +1322,7 @@ emsgsize: headersize == sizeof(struct ipv6hdr) && length < mtu - headersize && !(flags & MSG_MORE) && - rt->dst.dev->features & NETIF_F_V6_CSUM) + rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { @@ -1353,7 +1353,7 @@ emsgsize: (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && - (sk->sk_type == SOCK_DGRAM)) { + (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags, fl6); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index eabffbb89795..137fca42aaa6 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -177,7 +177,7 @@ void ip6_tnl_dst_reset(struct ip6_tnl *t) int i; for_each_possible_cpu(i) - ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); + ip6_tnl_per_cpu_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index ad19136086dd..a10e77103c88 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -118,7 +118,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, int cmd); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr6_table *mrt); +static void mroute_clean_tables(struct mr6_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES @@ -334,7 +334,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr6_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, true); kfree(mrt); } @@ -765,10 +765,6 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) return dev; failure: - /* allow the register to be completed before unregistering. */ - rtnl_unlock(); - rtnl_lock(); - unregister_netdevice(dev); return NULL; } @@ -1542,7 +1538,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr6_table *mrt) +static void mroute_clean_tables(struct mr6_table *mrt, bool all) { int i; LIST_HEAD(list); @@ -1552,8 +1548,9 @@ static void mroute_clean_tables(struct mr6_table *mrt) * Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) - mif6_delete(mrt, i, &list); + if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) + continue; + mif6_delete(mrt, i, &list); } unregister_netdevice_many(&list); @@ -1562,7 +1559,7 @@ static void mroute_clean_tables(struct mr6_table *mrt) */ for (i = 0; i < MFC6_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { - if (c->mfc_flags & MFC_STATIC) + if (!all && (c->mfc_flags & MFC_STATIC)) continue; write_lock_bh(&mrt_lock); list_del(&c->list); @@ -1625,7 +1622,7 @@ int ip6mr_sk_done(struct sock *sk) net->ipv6.devconf_all); write_unlock_bh(&mrt_lock); - mroute_clean_tables(mrt); + mroute_clean_tables(mrt, false); err = 0; break; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 63e6956917c9..4449ad1f8114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } - opt = xchg(&inet6_sk(sk)->opt, opt); + opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, + opt); sk_dst_reset(sk); return opt; @@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg(&np->opt, NULL); - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + opt = xchg((__force struct ipv6_txoptions **)&np->opt, + NULL); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); @@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = ipv6_renew_options(sk, np->opt, optname, + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = ipv6_renew_options(sk, opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); if (IS_ERR(opt)) { @@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; opt = ipv6_update_options(sk, opt); sticky_done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } @@ -486,6 +493,7 @@ sticky_done: break; memset(opt, 0, sizeof(*opt)); + atomic_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_user(opt+1, optval, optlen)) @@ -502,8 +510,10 @@ update: retv = 0; opt = ipv6_update_options(sk, opt); done: - if (opt) - sock_kfree_s(sk, opt, opt->tot_len); + if (opt) { + atomic_sub(opt->tot_len, &sk->sk_omem_alloc); + txopt_put(opt); + } break; } case IPV6_UNICAST_HOPS: @@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, case IPV6_RTHDR: case IPV6_DSTOPTS: { + struct ipv6_txoptions *opt; lock_sock(sk); - len = ipv6_getsockopt_sticky(sk, np->opt, - optname, optval, len); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 124338a39e29..5ee56d0a8699 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1651,7 +1651,6 @@ out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); } else { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); } @@ -2015,7 +2014,6 @@ out: if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); - IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); } else IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3e0f855e1bea..84afb9a77278 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -556,8 +556,7 @@ static void ndisc_send_unsol_na(struct net_device *dev) } void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr, - struct sk_buff *oskb) + const struct in6_addr *daddr, const struct in6_addr *saddr) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -593,9 +592,6 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr); - if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb) - skb_dst_copy(skb, oskb); - ndisc_send_skb(skb, daddr, saddr); } @@ -682,12 +678,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, target, target, saddr, skb); + ndisc_send_ns(dev, target, target, saddr); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, target, &mcaddr, saddr, skb); + ndisc_send_ns(dev, target, &mcaddr, saddr); } } @@ -1187,7 +1183,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) */ if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1341,7 +1337,7 @@ skip_linkparms: #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index f6a024e141e5..e10a04c9cdc7 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -49,6 +49,7 @@ config NFT_REJECT_IPV6 config NFT_DUP_IPV6 tristate "IPv6 nf_tables packet duplication support" + depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV6 help This module enables IPv6 packet duplication support for nf_tables. diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d5efeb87350e..e4347aeb2e65 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -56,7 +56,6 @@ struct nf_ct_frag6_skb_cb { struct inet6_skb_parm h; int offset; - struct sk_buff *orig; }; #define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb)) @@ -170,12 +169,6 @@ static unsigned int nf_hashfn(const struct inet_frag_queue *q) return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); } -static void nf_skb_free(struct sk_buff *skb) -{ - if (NFCT_FRAG6_CB(skb)->orig) - kfree_skb(NFCT_FRAG6_CB(skb)->orig); -} - static void nf_ct_frag6_expire(unsigned long data) { struct frag_queue *fq; @@ -190,7 +183,7 @@ static void nf_ct_frag6_expire(unsigned long data) /* Creation primitives. */ static inline struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, - struct in6_addr *dst, u8 ecn) + struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -200,6 +193,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id, arg.user = user; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; local_bh_disable(); @@ -368,17 +362,18 @@ err: /* * Check if this packet is complete. - * Returns NULL on failure by any reason, and pointer - * to current nexthdr field in reassembled frame. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. + * + * returns true if *prev skb has been transformed into the reassembled + * skb, false otherwise. */ -static struct sk_buff * -nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) +static bool +nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) { - struct sk_buff *fp, *op, *head = fq->q.fragments; + struct sk_buff *fp, *head = fq->q.fragments; int payload_len; u8 ecn; @@ -389,22 +384,21 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) - goto out_fail; + return false; /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) { - pr_debug("payload len is too large.\n"); - goto out_oversize; + net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", + payload_len); + return false; } /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) { - pr_debug("skb is cloned but can't expand head"); - goto out_oom; - } + if (skb_unclone(head, GFP_ATOMIC)) + return false; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part @@ -415,7 +409,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone = alloc_skb(0, GFP_ATOMIC); if (clone == NULL) - goto out_oom; + return false; clone->next = head->next; head->next = clone; @@ -429,10 +423,41 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) clone->csum = 0; clone->ip_summed = head->ip_summed; - NFCT_FRAG6_CB(clone)->orig = NULL; add_frag_mem_limit(fq->q.net, clone->truesize); } + /* morph head into last received skb: prev. + * + * This allows callers of ipv6 conntrack defrag to continue + * to use the last skb(frag) passed into the reasm engine. + * The last skb frag 'silently' turns into the full reassembled skb. + * + * Since prev is also part of q->fragments we have to clone it first. + */ + if (head != prev) { + struct sk_buff *iter; + + fp = skb_clone(prev, GFP_ATOMIC); + if (!fp) + return false; + + fp->next = prev->next; + + iter = head; + while (iter) { + if (iter->next == prev) { + iter->next = fp; + break; + } + iter = iter->next; + } + + skb_morph(prev, head); + prev->next = head->next; + consume_skb(head); + head = prev; + } + /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; @@ -473,31 +498,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev) fq->q.fragments = NULL; fq->q.fragments_tail = NULL; - /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ - fp = skb_shinfo(head)->frag_list; - if (fp && NFCT_FRAG6_CB(fp)->orig == NULL) - /* at above code, head skb is divided into two skbs. */ - fp = fp->next; - - op = NFCT_FRAG6_CB(head)->orig; - for (; fp; fp = fp->next) { - struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; - - op->next = orig; - op = orig; - NFCT_FRAG6_CB(fp)->orig = NULL; - } - - return head; - -out_oversize: - net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", - payload_len); - goto out_fail; -out_oom: - net_dbg_ratelimited("nf_ct_frag6_reasm: no memory for reassembly\n"); -out_fail: - return NULL; + return true; } /* @@ -563,89 +564,61 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) return 0; } -struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) +int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { - struct sk_buff *clone; struct net_device *dev = skb->dev; + int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; - int fhoff, nhoff; u8 prevhdr; - struct sk_buff *ret_skb = NULL; /* Jumbo payload inhibits frag. header */ if (ipv6_hdr(skb)->payload_len == 0) { pr_debug("payload len = 0\n"); - return skb; + return -EINVAL; } if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) - return skb; + return -EINVAL; - clone = skb_clone(skb, GFP_ATOMIC); - if (clone == NULL) { - pr_debug("Can't clone skb\n"); - return skb; - } + if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) + return -ENOMEM; - NFCT_FRAG6_CB(clone)->orig = skb; - - if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) { - pr_debug("message is too short.\n"); - goto ret_orig; - } - - skb_set_transport_header(clone, fhoff); - hdr = ipv6_hdr(clone); - fhdr = (struct frag_hdr *)skb_transport_header(clone); + skb_set_transport_header(skb, fhoff); + hdr = ipv6_hdr(skb); + fhdr = (struct frag_hdr *)skb_transport_header(skb); fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); - goto ret_orig; + return -ENOMEM; } spin_lock_bh(&fq->q.lock); - if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { - spin_unlock_bh(&fq->q.lock); - pr_debug("Can't insert skb to queue\n"); - inet_frag_put(&fq->q, &nf_frags); - goto ret_orig; + if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) { + ret = -EINVAL; + goto out_unlock; } + /* after queue has assumed skb ownership, only 0 or -EINPROGRESS + * must be returned. + */ + ret = -EINPROGRESS; if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len) { - ret_skb = nf_ct_frag6_reasm(fq, dev); - if (ret_skb == NULL) - pr_debug("Can't reassemble fragmented packets\n"); - } - spin_unlock_bh(&fq->q.lock); + fq->q.meat == fq->q.len && + nf_ct_frag6_reasm(fq, skb, dev)) + ret = 0; +out_unlock: + spin_unlock_bh(&fq->q.lock); inet_frag_put(&fq->q, &nf_frags); - return ret_skb; - -ret_orig: - kfree_skb(clone); - return skb; + return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); -void nf_ct_frag6_consume_orig(struct sk_buff *skb) -{ - struct sk_buff *s, *s2; - - for (s = NFCT_FRAG6_CB(skb)->orig; s;) { - s2 = s->next; - s->next = NULL; - consume_skb(s); - s = s2; - } -} -EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig); - static int nf_ct_net_init(struct net *net) { int res; @@ -680,7 +653,6 @@ int nf_ct_frag6_init(void) nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; - nf_frags.skb_free = nf_skb_free; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 4fdbed5ebfb6..f7aab5ab93a5 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -55,7 +55,7 @@ static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - struct sk_buff *reasm; + int err; #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Previously seen (loopback)? */ @@ -63,23 +63,13 @@ static unsigned int ipv6_defrag(void *priv, return NF_ACCEPT; #endif - reasm = nf_ct_frag6_gather(state->net, skb, - nf_ct6_defrag_user(state->hook, skb)); + err = nf_ct_frag6_gather(state->net, skb, + nf_ct6_defrag_user(state->hook, skb)); /* queued */ - if (reasm == NULL) + if (err == -EINPROGRESS) return NF_STOLEN; - /* error occurred or not fragmented */ - if (reasm == skb) - return NF_ACCEPT; - - nf_ct_frag6_consume_orig(reasm); - - NF_HOOK_THRESH(NFPROTO_IPV6, state->hook, state->net, state->sk, reasm, - state->in, state->out, - state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1); - - return NF_STOLEN; + return NF_ACCEPT; } static struct nf_hook_ops ipv6_defrag_ops[] = { diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 238e70c3f7b7..6ce309928841 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -136,7 +136,8 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_PARTIAL) { if (!(rt->rt6i_flags & RTF_LOCAL) && - (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) { + (!skb->dev || skb->dev->features & + (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index e0f922b777e3..4709f657b7b6 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -14,7 +14,6 @@ #include <net/netfilter/ipv6/nf_reject.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> -#include <net/netfilter/ipv6/nf_reject.h> const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 120ea9131be0..30b22f4dff55 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -77,7 +77,7 @@ err: static void nf_tables_ipv6_exit_net(struct net *net) { - nft_unregister_afinfo(net->nft.ipv6); + nft_unregister_afinfo(net, net->nft.ipv6); kfree(net->nft.ipv6); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index dc65ec198f7c..fa59dd7a427e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -733,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; @@ -839,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -906,6 +909,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: dst_confirm(dst); @@ -968,6 +972,11 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; switch (optname) { + case IPV6_HDRINCL: + if (sk->sk_type != SOCK_RAW) + return -EINVAL; + inet_sk(sk)->hdrincl = !!val; + return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6) { @@ -1012,7 +1021,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return ipv6_setsockopt(sk, level, optname, optval, optlen); @@ -1033,7 +1043,8 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return compat_ipv6_setsockopt(sk, level, optname, @@ -1053,6 +1064,9 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; switch (optname) { + case IPV6_HDRINCL: + val = inet_sk(sk)->hdrincl; + break; case IPV6_CHECKSUM: /* * We allow getsockopt() for IPPROTO_IPV6-level @@ -1090,7 +1104,8 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return ipv6_getsockopt(sk, level, optname, optval, optlen); @@ -1111,7 +1126,8 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: - if (optname == IPV6_CHECKSUM) + if (optname == IPV6_CHECKSUM || + optname == IPV6_HDRINCL) break; default: return compat_ipv6_getsockopt(sk, level, optname, diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 44e21a03cfc3..18f3498a6c80 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -108,7 +108,10 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) return fq->id == arg->id && fq->user == arg->user && ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst); + ipv6_addr_equal(&fq->daddr, arg->dst) && + (arg->iif == fq->iif || + !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))); } EXPORT_SYMBOL(ip6_frag_match); @@ -180,7 +183,7 @@ static void ip6_frag_expire(unsigned long data) static struct frag_queue * fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, u8 ecn) + const struct in6_addr *dst, int iif, u8 ecn) { struct inet_frag_queue *q; struct ip6_create_arg arg; @@ -190,6 +193,7 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src, arg.user = IP6_DEFRAG_LOCAL_DELIVER; arg.src = src; arg.dst = dst; + arg.iif = iif; arg.ecn = ecn; hash = inet6_hash_frag(id, src, dst); @@ -551,7 +555,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); if (fq) { int ret; @@ -751,7 +755,6 @@ int __init ipv6_frag_init(void) ip6_frags.hashfn = ip6_hashfn; ip6_frags.constructor = ip6_frag_init; ip6_frags.destructor = NULL; - ip6_frags.skb_free = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c8bc9b4ac328..3c8834bc822d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -62,6 +62,7 @@ #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> +#include <trace/events/fib6.h> #include <asm/uaccess.h> @@ -404,6 +405,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, } } +static bool __rt6_check_expired(const struct rt6_info *rt) +{ + if (rt->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, rt->dst.expires); + else + return false; +} + static bool rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) { @@ -515,7 +524,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); dev_put(work->dev); kfree(work); } @@ -857,6 +866,9 @@ restart: } dst_use(&rt->dst, jiffies); read_unlock_bh(&table->tb6_lock); + + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + return rt; } @@ -1070,6 +1082,8 @@ redo_rt6_select: read_unlock_bh(&table->tb6_lock); rt6_dst_from_metrics_check(rt); + + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt->rt6i_flags & RTF_GATEWAY))) { @@ -1093,6 +1107,8 @@ redo_rt6_select: uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); + + trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; } else { @@ -1117,6 +1133,7 @@ redo_rt6_select: dst_release(&rt->dst); } + trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; } @@ -1252,7 +1269,8 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { - if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + if (!__rt6_check_expired(rt) && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && rt6_check((struct rt6_info *)(rt->dst.from), cookie)) return &rt->dst; else @@ -1272,7 +1290,8 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt6_dst_from_metrics_check(rt); - if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE)) + if (rt->rt6i_flags & RTF_PCPU || + (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from)) return rt6_dst_from_check(rt, cookie); else return rt6_check(rt, cookie); @@ -1322,6 +1341,12 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } +static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) +{ + return !(rt->rt6i_flags & RTF_CACHE) && + (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); +} + static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct ipv6hdr *iph, u32 mtu) { @@ -1335,7 +1360,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (mtu >= dst_mtu(dst)) return; - if (rt6->rt6i_flags & RTF_CACHE) { + if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); } else { const struct in6_addr *daddr, *saddr; @@ -1458,6 +1483,7 @@ out: read_unlock_bh(&table->tb6_lock); + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; }; @@ -2683,6 +2709,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_EXPIRES] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2783,6 +2810,15 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + if (tb[RTA_EXPIRES]) { + unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); + + if (addrconf_finite_timeout(timeout)) { + cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); + cfg->fc_flags |= RTF_EXPIRES; + } + } + err = 0; errout: return err; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index dcccae86190f..e794ef66a401 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -820,7 +820,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, const struct in6_addr *addr6; int addr_type; u8 ttl; - int err; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); @@ -983,10 +982,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, IPPROTO_IPV6); - err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, - protocol, tos, ttl, df, - !net_eq(tunnel->net, dev_net(dev))); - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return NETDEV_TX_OK; tx_error_icmp: diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bb8f2fa1c7fb..2906ef20795e 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -193,7 +193,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->pktopts = skb; } - ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_iif = inet_request_bound_dev_if(sk, skb); /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) @@ -222,9 +222,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; - final_p = fl6_update_dst(&fl6, np->opt, &final); + final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_oif = ireq->ir_iif; fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5baa8e754e41..006396e31cb0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,7 +61,6 @@ #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> -#include <net/tcp_memcontrol.h> #include <net/busy_poll.h> #include <linux/proc_fs.h> @@ -93,10 +92,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { + if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); @@ -120,6 +118,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; + struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; @@ -235,7 +234,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - final_p = fl6_update_dst(&fl6, np->opt, &final); + opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -255,7 +255,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && @@ -263,9 +263,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_fetch_timewait_stamp(sk, dst); icsk->icsk_ext_hdr_len = 0; - if (np->opt) - icsk->icsk_ext_hdr_len = (np->opt->opt_flen + - np->opt->opt_nflen); + if (opt) + icsk->icsk_ext_hdr_len = opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -461,7 +461,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + rcu_read_lock(); + err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), + np->tclass); + rcu_read_unlock(); err = net_xmit_eval(err); } @@ -852,7 +855,9 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG hash_location = tcp_parse_md5sig_option(th); - if (!sk && hash_location) { + if (sk && sk_fullsock(sk)) { + key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); + } else if (hash_location) { /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -875,8 +880,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) goto release_sk1; - } else { - key = sk ? tcp_v6_md5_do_lookup(sk, &ipv6h->saddr) : NULL; } #endif @@ -972,6 +975,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; @@ -1056,7 +1060,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; @@ -1098,13 +1102,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. */ - if (np->opt) - newnp->opt = ipv6_dup_options(newsk, np->opt); - + opt = rcu_dereference(np->opt); + if (opt) { + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + } inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newnp->opt) - inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + - newnp->opt->opt_flen); + if (opt) + inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); @@ -1130,7 +1136,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, AF_INET6, key->key, key->keylen, - sk_gfp_atomic(sk, GFP_ATOMIC)); + sk_gfp_mask(sk, GFP_ATOMIC)); } #endif @@ -1146,7 +1152,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_atomic(sk, GFP_ATOMIC)); + sk_gfp_mask(sk, GFP_ATOMIC)); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) @@ -1212,7 +1218,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); + opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst = sk->sk_rx_dst; @@ -1511,7 +1517,9 @@ do_time_wait: break; case TCP_TW_RST: tcp_v6_restore_cb(skb); - goto no_tcp_socket; + tcp_v6_send_reset(sk, skb); + inet_twsk_deschedule_put(inet_twsk(sk)); + goto discard_it; case TCP_TW_SUCCESS: ; } @@ -1690,6 +1698,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + int rx_queue; + int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; @@ -1710,6 +1720,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_expires = jiffies; } + state = sk_state_load(sp); + if (state == TCP_LISTEN) + rx_queue = sp->sk_ack_backlog; + else + /* Because we don't lock the socket, + * we might find a transient negative value. + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", @@ -1718,9 +1737,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - sp->sk_state, - tp->write_seq-tp->snd_una, - (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), + state, + tp->write_seq - tp->snd_una, + rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, @@ -1732,7 +1751,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, - sp->sk_state == TCP_LISTEN ? + state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); @@ -1869,10 +1888,8 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .proto_cgroup = tcp_proto_cgroup, -#endif .clear_sk = tcp_v6_clear_sk, + .diag_destroy = tcp_abort, }; static const struct inet6_protocol tcpv6_protocol = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 01bcb49619ee..5d2c2afffe7b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -47,6 +47,7 @@ #include <net/xfrm.h> #include <net/inet6_hashtables.h> #include <net/busy_poll.h> +#include <net/sock_reuseport.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -76,7 +77,14 @@ static u32 udp6_ehashfn(const struct net *net, udp_ipv6_hash_secret + net_hash_mix(net)); } -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 + * only, and any IPv4 addresses if not IPv6 only + * match_wildcard == false: addresses must be exactly the same, i.e. + * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, + * and 0.0.0.0 equals to 0.0.0.0 only + */ +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard) { const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); int sk2_ipv6only = inet_v6_ipv6only(sk2); @@ -84,16 +92,24 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ - if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) - return (!sk2_ipv6only && - (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr || - sk->sk_rcv_saddr == sk2->sk_rcv_saddr)); + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { + if (!sk2_ipv6only) { + if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr) + return 1; + if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr) + return match_wildcard; + } + return 0; + } - if (addr_type2 == IPV6_ADDR_ANY && + if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && match_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return 1; - if (addr_type == IPV6_ADDR_ANY && + if (addr_type == IPV6_ADDR_ANY && match_wildcard && !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED)) return 1; @@ -235,7 +251,8 @@ static inline int compute_score2(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2) + struct udp_hslot *hslot2, unsigned int slot2, + struct sk_buff *skb) { struct sock *sk, *result; struct hlist_nulls_node *node; @@ -253,8 +270,15 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -273,6 +297,7 @@ begin: goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score2(result, net, saddr, sport, @@ -287,7 +312,8 @@ begin: struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *udptable) + int dif, struct udp_table *udptable, + struct sk_buff *skb) { struct sock *sk, *result; struct hlist_nulls_node *node; @@ -307,7 +333,7 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); if (!result) { hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; @@ -317,7 +343,7 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, - hslot2, slot2); + hslot2, slot2, skb); } rcu_read_unlock(); return result; @@ -332,8 +358,15 @@ begin: badness = score; reuseport = sk->sk_reuseport; if (reuseport) { + struct sock *sk2; hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); + sk2 = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (sk2) { + result = sk2; + goto found; + } matches = 1; } } else if (score == badness && reuseport) { @@ -352,6 +385,7 @@ begin: goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) result = NULL; else if (unlikely(compute_score(result, net, hnum, saddr, sport, @@ -377,13 +411,13 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, return sk; return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - udptable); + udptable, skb); } struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { - return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL); } EXPORT_SYMBOL_GPL(udp6_lib_lookup); @@ -402,6 +436,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int peeked, off = 0; int err; int is_udplite = IS_UDPLITE(sk); + bool checksum_valid = false; int is_udp4; bool slow; @@ -433,11 +468,12 @@ try_again: */ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { - if (udp_lib_checksum_complete(skb)) + checksum_valid = !udp_lib_checksum_complete(skb); + if (!checksum_valid) goto csum_copy_err; } - if (skb_csum_unnecessary(skb)) + if (checksum_valid || skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), msg, copied); else { @@ -547,8 +583,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int err; struct net *net = dev_net(skb->dev); - sk = __udp6_lib_lookup(net, daddr, uh->dest, - saddr, uh->source, inet6_iif(skb), udptable); + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, + inet6_iif(skb), udptable, skb); if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -1110,6 +1146,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_txoptions *opt = NULL; + struct ipv6_txoptions *opt_to_free = NULL; struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; @@ -1263,8 +1300,10 @@ do_udp_sendmsg: opt = NULL; connected = 0; } - if (!opt) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -1373,6 +1412,7 @@ release_dst: out: dst_release(dst); fl6_sock_release(flowlabel); + txopt_put(opt_to_free); if (!err) return len; /* diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index f7fbdbabe50e..372855eeaf42 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -23,7 +23,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP6_ECN_set_ce(inner_iph); + IP6_ECN_set_ce(skb, inner_iph); } /* Add encapsulation header. diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 5643423fe67a..c074771a10f7 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -279,7 +279,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xfrm_dst_ifdown(dst, dev); } -static struct dst_ops xfrm6_dst_ops = { +static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, @@ -293,7 +293,7 @@ static struct dst_ops xfrm6_dst_ops = { static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .dst_ops = &xfrm6_dst_ops, + .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, @@ -325,7 +325,7 @@ static struct ctl_table xfrm6_policy_table[] = { { } }; -static int __net_init xfrm6_net_init(struct net *net) +static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -353,7 +353,7 @@ err_alloc: return -ENOMEM; } -static void __net_exit xfrm6_net_exit(struct net *net) +static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -365,24 +365,52 @@ static void __net_exit xfrm6_net_exit(struct net *net) if (!net_eq(net, &init_net)) kfree(table); } +#else /* CONFIG_SYSCTL */ +static int inline xfrm6_net_sysctl_init(struct net *net) +{ + return 0; +} + +static void inline xfrm6_net_sysctl_exit(struct net *net) +{ +} +#endif + +static int __net_init xfrm6_net_init(struct net *net) +{ + int ret; + + memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, + sizeof(xfrm6_dst_ops_template)); + ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); + if (ret) + return ret; + + ret = xfrm6_net_sysctl_init(net); + if (ret) + dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); + + return ret; +} + +static void __net_exit xfrm6_net_exit(struct net *net) +{ + xfrm6_net_sysctl_exit(net); + dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); +} static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; -#endif int __init xfrm6_init(void) { int ret; - dst_entries_init(&xfrm6_dst_ops); - ret = xfrm6_policy_init(); - if (ret) { - dst_entries_destroy(&xfrm6_dst_ops); + if (ret) goto out; - } ret = xfrm6_state_init(); if (ret) goto out_policy; @@ -391,9 +419,7 @@ int __init xfrm6_init(void) if (ret) goto out_state; -#ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm6_net_ops); -#endif out: return ret; out_state: @@ -405,11 +431,8 @@ out_policy: void xfrm6_fini(void) { -#ifdef CONFIG_SYSCTL unregister_pernet_subsys(&xfrm6_net_ops); -#endif xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); - dst_entries_destroy(&xfrm6_dst_ops); } diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index e6aa48b5395c..923abd6b3064 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; struct irda_sock *self; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (net != &init_net) return -EAFNOSUPPORT; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fcb2752419c6..ef50a94d3eb7 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -303,7 +303,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); @@ -1031,7 +1031,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); struct sk_buff *skb; - struct iucv_message txmsg; + struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; int cmsg_done; long timeo; @@ -1483,7 +1483,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && iucv_below_msglim(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } @@ -2084,11 +2084,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) return NET_RX_SUCCESS; } - /* write stuff from iucv_msg to skb cb */ - if (skb->len < sizeof(struct af_iucv_trans_hdr)) { - kfree_skb(skb); - return NET_RX_SUCCESS; - } + /* write stuff from iucv_msg to skb cb */ skb_pull(skb, sizeof(struct af_iucv_trans_hdr)); skb_reset_transport_header(skb); skb_reset_network_header(skb); @@ -2119,6 +2115,20 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, char nullstring[8]; int err = 0; + if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) { + WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d", + (int)skb->len, + (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr))); + kfree_skb(skb); + return NET_RX_SUCCESS; + } + if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) + if (skb_linearize(skb)) { + WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d", + (int)skb->len); + kfree_skb(skb); + return NET_RX_SUCCESS; + } skb_pull(skb, ETH_HLEN); trans_hdr = (struct af_iucv_trans_hdr *)skb->data; EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName)); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index aca38d8aed8e..a2c8747d2936 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -486,6 +486,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; @@ -575,8 +576,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = NULL; } - if (opt == NULL) - opt = np->opt; + if (!opt) { + opt = txopt_get(np); + opt_to_free = opt; + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -631,6 +634,7 @@ done: dst_release(dst); out: fl6_sock_release(flowlabel); + txopt_put(opt_to_free); return err < 0 ? err : len; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1ad18c55064c..652c250b9a3b 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -230,26 +230,11 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; + l2tp_dbg(session, PPPOL2TP_MSG_DATA, "%s: recv %d byte data frame, passing to ppp\n", session->name, data_len); - /* We need to forget all info related to the L2TP packet - * gathered in the skb as we are going to reuse the same - * skb for the inner packet. - * Namely we need to: - * - reset xfrm (IPSec) information as it applies to - * the outer L2TP packet and not to the inner one - * - release the dst to force a route lookup on the inner - * IP packet since skb->dst currently points to the dst - * of the UDP tunnel - * - reset netfilter information as it doesn't apply - * to the inner packet either - */ - secpath_reset(skb); - skb_dst_drop(skb); - nf_reset(skb); - po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { @@ -1862,5 +1847,5 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); -MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP)); +MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP); MODULE_ALIAS_L2TP_PWTYPE(11); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index a758eb84e8f0..ff757181b0a8 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -500,7 +500,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, start_seq_num, - local->hw.max_tx_aggregation_subframes, + IEEE80211_MAX_AMPDU_BUF, tid_tx->timeout); } @@ -926,6 +926,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; + buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); mutex_lock(&sta->ampdu_mlme.mtx); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c2bd1b6a6922..166a29fe6c35 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1169,8 +1169,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, * rc isn't initialized here yet, so ignore it */ __ieee80211_vht_handle_opmode(sdata, sta, - params->opmode_notif, - band, false); + params->opmode_notif, band); } if (ieee80211_vif_is_mesh(&sdata->vif)) @@ -1216,16 +1215,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (!sta) return -ENOMEM; - /* - * defaults -- if userspace wants something else we'll - * change it accordingly in sta_apply_parameters() - */ - if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) && - !(params->sta_flags_set & (BIT(NL80211_STA_FLAG_AUTHENTICATED) | - BIT(NL80211_STA_FLAG_ASSOCIATED)))) { - sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); - sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); - } if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) sta->sta.tdls = true; @@ -1994,6 +1983,11 @@ static int ieee80211_scan(struct wiphy *wiphy, return ieee80211_request_scan(sdata, req); } +static void ieee80211_abort_scan(struct wiphy *wiphy, struct wireless_dev *wdev) +{ + ieee80211_scan_cancel(wiphy_priv(wiphy)); +} + static int ieee80211_sched_scan_start(struct wiphy *wiphy, struct net_device *dev, @@ -2509,294 +2503,6 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return 0; } -static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local, - struct ieee80211_roc_work *new_roc, - struct ieee80211_roc_work *cur_roc) -{ - unsigned long now = jiffies; - unsigned long remaining = cur_roc->hw_start_time + - msecs_to_jiffies(cur_roc->duration) - - now; - - if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun)) - return false; - - /* if it doesn't fit entirely, schedule a new one */ - if (new_roc->duration > jiffies_to_msecs(remaining)) - return false; - - ieee80211_handle_roc_started(new_roc); - - /* add to dependents so we send the expired event properly */ - list_add_tail(&new_roc->list, &cur_roc->dependents); - return true; -} - -static u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) -{ - lockdep_assert_held(&local->mtx); - - local->roc_cookie_counter++; - - /* wow, you wrapped 64 bits ... more likely a bug */ - if (WARN_ON(local->roc_cookie_counter == 0)) - local->roc_cookie_counter++; - - return local->roc_cookie_counter; -} - -static int ieee80211_start_roc_work(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel *channel, - unsigned int duration, u64 *cookie, - struct sk_buff *txskb, - enum ieee80211_roc_type type) -{ - struct ieee80211_roc_work *roc, *tmp; - bool queued = false; - int ret; - - lockdep_assert_held(&local->mtx); - - if (local->use_chanctx && !local->ops->remain_on_channel) - return -EOPNOTSUPP; - - roc = kzalloc(sizeof(*roc), GFP_KERNEL); - if (!roc) - return -ENOMEM; - - /* - * If the duration is zero, then the driver - * wouldn't actually do anything. Set it to - * 10 for now. - * - * TODO: cancel the off-channel operation - * when we get the SKB's TX status and - * the wait time was zero before. - */ - if (!duration) - duration = 10; - - roc->chan = channel; - roc->duration = duration; - roc->req_duration = duration; - roc->frame = txskb; - roc->type = type; - roc->sdata = sdata; - INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); - INIT_LIST_HEAD(&roc->dependents); - - /* - * cookie is either the roc cookie (for normal roc) - * or the SKB (for mgmt TX) - */ - if (!txskb) { - roc->cookie = ieee80211_mgmt_tx_cookie(local); - *cookie = roc->cookie; - } else { - roc->mgmt_tx_cookie = *cookie; - } - - /* if there's one pending or we're scanning, queue this one */ - if (!list_empty(&local->roc_list) || - local->scanning || ieee80211_is_radar_required(local)) - goto out_check_combine; - - /* if not HW assist, just queue & schedule work */ - if (!local->ops->remain_on_channel) { - ieee80211_queue_delayed_work(&local->hw, &roc->work, 0); - goto out_queue; - } - - /* otherwise actually kick it off here (for error handling) */ - - ret = drv_remain_on_channel(local, sdata, channel, duration, type); - if (ret) { - kfree(roc); - return ret; - } - - roc->started = true; - goto out_queue; - - out_check_combine: - list_for_each_entry(tmp, &local->roc_list, list) { - if (tmp->chan != channel || tmp->sdata != sdata) - continue; - - /* - * Extend this ROC if possible: - * - * If it hasn't started yet, just increase the duration - * and add the new one to the list of dependents. - * If the type of the new ROC has higher priority, modify the - * type of the previous one to match that of the new one. - */ - if (!tmp->started) { - list_add_tail(&roc->list, &tmp->dependents); - tmp->duration = max(tmp->duration, roc->duration); - tmp->type = max(tmp->type, roc->type); - queued = true; - break; - } - - /* If it has already started, it's more difficult ... */ - if (local->ops->remain_on_channel) { - /* - * In the offloaded ROC case, if it hasn't begun, add - * this new one to the dependent list to be handled - * when the master one begins. If it has begun, - * check if it fits entirely within the existing one, - * in which case it will just be dependent as well. - * Otherwise, schedule it by itself. - */ - if (!tmp->hw_begun) { - list_add_tail(&roc->list, &tmp->dependents); - queued = true; - break; - } - - if (ieee80211_coalesce_started_roc(local, roc, tmp)) - queued = true; - } else if (del_timer_sync(&tmp->work.timer)) { - unsigned long new_end; - - /* - * In the software ROC case, cancel the timer, if - * that fails then the finish work is already - * queued/pending and thus we queue the new ROC - * normally, if that succeeds then we can extend - * the timer duration and TX the frame (if any.) - */ - - list_add_tail(&roc->list, &tmp->dependents); - queued = true; - - new_end = jiffies + msecs_to_jiffies(roc->duration); - - /* ok, it was started & we canceled timer */ - if (time_after(new_end, tmp->work.timer.expires)) - mod_timer(&tmp->work.timer, new_end); - else - add_timer(&tmp->work.timer); - - ieee80211_handle_roc_started(roc); - } - break; - } - - out_queue: - if (!queued) - list_add_tail(&roc->list, &local->roc_list); - - return 0; -} - -static int ieee80211_remain_on_channel(struct wiphy *wiphy, - struct wireless_dev *wdev, - struct ieee80211_channel *chan, - unsigned int duration, - u64 *cookie) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct ieee80211_local *local = sdata->local; - int ret; - - mutex_lock(&local->mtx); - ret = ieee80211_start_roc_work(local, sdata, chan, - duration, cookie, NULL, - IEEE80211_ROC_TYPE_NORMAL); - mutex_unlock(&local->mtx); - - return ret; -} - -static int ieee80211_cancel_roc(struct ieee80211_local *local, - u64 cookie, bool mgmt_tx) -{ - struct ieee80211_roc_work *roc, *tmp, *found = NULL; - int ret; - - mutex_lock(&local->mtx); - list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { - struct ieee80211_roc_work *dep, *tmp2; - - list_for_each_entry_safe(dep, tmp2, &roc->dependents, list) { - if (!mgmt_tx && dep->cookie != cookie) - continue; - else if (mgmt_tx && dep->mgmt_tx_cookie != cookie) - continue; - /* found dependent item -- just remove it */ - list_del(&dep->list); - mutex_unlock(&local->mtx); - - ieee80211_roc_notify_destroy(dep, true); - return 0; - } - - if (!mgmt_tx && roc->cookie != cookie) - continue; - else if (mgmt_tx && roc->mgmt_tx_cookie != cookie) - continue; - - found = roc; - break; - } - - if (!found) { - mutex_unlock(&local->mtx); - return -ENOENT; - } - - /* - * We found the item to cancel, so do that. Note that it - * may have dependents, which we also cancel (and send - * the expired signal for.) Not doing so would be quite - * tricky here, but we may need to fix it later. - */ - - if (local->ops->remain_on_channel) { - if (found->started) { - ret = drv_cancel_remain_on_channel(local); - if (WARN_ON_ONCE(ret)) { - mutex_unlock(&local->mtx); - return ret; - } - } - - list_del(&found->list); - - if (found->started) - ieee80211_start_next_roc(local); - mutex_unlock(&local->mtx); - - ieee80211_roc_notify_destroy(found, true); - } else { - /* work may be pending so use it all the time */ - found->abort = true; - ieee80211_queue_delayed_work(&local->hw, &found->work, 0); - - mutex_unlock(&local->mtx); - - /* work will clean up etc */ - flush_delayed_work(&found->work); - WARN_ON(!found->to_be_freed); - kfree(found); - } - - return 0; -} - -static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, - struct wireless_dev *wdev, - u64 cookie) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct ieee80211_local *local = sdata->local; - - return ieee80211_cancel_roc(local, cookie, false); -} - static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, @@ -3267,9 +2973,21 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, return err; } -static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, - struct sk_buff *skb, u64 *cookie, - gfp_t gfp) +u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local) +{ + lockdep_assert_held(&local->mtx); + + local->roc_cookie_counter++; + + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(local->roc_cookie_counter == 0)) + local->roc_cookie_counter++; + + return local->roc_cookie_counter; +} + +int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, + u64 *cookie, gfp_t gfp) { unsigned long spin_flags; struct sk_buff *ack_skb; @@ -3277,7 +2995,7 @@ static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, ack_skb = skb_copy(skb, gfp); if (!ack_skb) - return ERR_PTR(-ENOMEM); + return -ENOMEM; spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, @@ -3286,7 +3004,7 @@ static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, if (id < 0) { kfree_skb(ack_skb); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } IEEE80211_SKB_CB(skb)->ack_frame_id = id; @@ -3294,200 +3012,7 @@ static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local, *cookie = ieee80211_mgmt_tx_cookie(local); IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; - return ack_skb; -} - -static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, - struct cfg80211_mgmt_tx_params *params, - u64 *cookie) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct ieee80211_local *local = sdata->local; - struct sk_buff *skb, *ack_skb; - struct sta_info *sta; - const struct ieee80211_mgmt *mgmt = (void *)params->buf; - bool need_offchan = false; - u32 flags; - int ret; - u8 *data; - - if (params->dont_wait_for_ack) - flags = IEEE80211_TX_CTL_NO_ACK; - else - flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | - IEEE80211_TX_CTL_REQ_TX_STATUS; - - if (params->no_cck) - flags |= IEEE80211_TX_CTL_NO_CCK_RATE; - - switch (sdata->vif.type) { - case NL80211_IFTYPE_ADHOC: - if (!sdata->vif.bss_conf.ibss_joined) - need_offchan = true; - /* fall through */ -#ifdef CONFIG_MAC80211_MESH - case NL80211_IFTYPE_MESH_POINT: - if (ieee80211_vif_is_mesh(&sdata->vif) && - !sdata->u.mesh.mesh_id_len) - need_offchan = true; - /* fall through */ -#endif - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_GO: - if (sdata->vif.type != NL80211_IFTYPE_ADHOC && - !ieee80211_vif_is_mesh(&sdata->vif) && - !rcu_access_pointer(sdata->bss->beacon)) - need_offchan = true; - if (!ieee80211_is_action(mgmt->frame_control) || - mgmt->u.action.category == WLAN_CATEGORY_PUBLIC || - mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED || - mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) - break; - rcu_read_lock(); - sta = sta_info_get(sdata, mgmt->da); - rcu_read_unlock(); - if (!sta) - return -ENOLINK; - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - sdata_lock(sdata); - if (!sdata->u.mgd.associated || - (params->offchan && params->wait && - local->ops->remain_on_channel && - memcmp(sdata->u.mgd.associated->bssid, - mgmt->bssid, ETH_ALEN))) - need_offchan = true; - sdata_unlock(sdata); - break; - case NL80211_IFTYPE_P2P_DEVICE: - need_offchan = true; - break; - default: - return -EOPNOTSUPP; - } - - /* configurations requiring offchan cannot work if no channel has been - * specified - */ - if (need_offchan && !params->chan) - return -EINVAL; - - mutex_lock(&local->mtx); - - /* Check if the operating channel is the requested channel */ - if (!need_offchan) { - struct ieee80211_chanctx_conf *chanctx_conf; - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - - if (chanctx_conf) { - need_offchan = params->chan && - (params->chan != - chanctx_conf->def.chan); - } else if (!params->chan) { - ret = -EINVAL; - rcu_read_unlock(); - goto out_unlock; - } else { - need_offchan = true; - } - rcu_read_unlock(); - } - - if (need_offchan && !params->offchan) { - ret = -EBUSY; - goto out_unlock; - } - - skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len); - if (!skb) { - ret = -ENOMEM; - goto out_unlock; - } - skb_reserve(skb, local->hw.extra_tx_headroom); - - data = skb_put(skb, params->len); - memcpy(data, params->buf, params->len); - - /* Update CSA counters */ - if (sdata->vif.csa_active && - (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT || - sdata->vif.type == NL80211_IFTYPE_ADHOC) && - params->n_csa_offsets) { - int i; - struct beacon_data *beacon = NULL; - - rcu_read_lock(); - - if (sdata->vif.type == NL80211_IFTYPE_AP) - beacon = rcu_dereference(sdata->u.ap.beacon); - else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - beacon = rcu_dereference(sdata->u.ibss.presp); - else if (ieee80211_vif_is_mesh(&sdata->vif)) - beacon = rcu_dereference(sdata->u.mesh.beacon); - - if (beacon) - for (i = 0; i < params->n_csa_offsets; i++) - data[params->csa_offsets[i]] = - beacon->csa_current_counter; - - rcu_read_unlock(); - } - - IEEE80211_SKB_CB(skb)->flags = flags; - - skb->dev = sdata->dev; - - if (!params->dont_wait_for_ack) { - /* make a copy to preserve the frame contents - * in case of encryption. - */ - ack_skb = ieee80211_make_ack_skb(local, skb, cookie, - GFP_KERNEL); - if (IS_ERR(ack_skb)) { - ret = PTR_ERR(ack_skb); - kfree_skb(skb); - goto out_unlock; - } - } else { - /* for cookie below */ - ack_skb = skb; - } - - if (!need_offchan) { - ieee80211_tx_skb(sdata, skb); - ret = 0; - goto out_unlock; - } - - IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | - IEEE80211_TX_INTFL_OFFCHAN_TX_OK; - if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) - IEEE80211_SKB_CB(skb)->hw_queue = - local->hw.offchannel_tx_hw_queue; - - /* This will handle all kinds of coalescing and immediate TX */ - ret = ieee80211_start_roc_work(local, sdata, params->chan, - params->wait, cookie, skb, - IEEE80211_ROC_TYPE_MGMT_TX); - if (ret) - kfree_skb(skb); - out_unlock: - mutex_unlock(&local->mtx); - return ret; -} - -static int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, - struct wireless_dev *wdev, - u64 cookie) -{ - struct ieee80211_local *local = wiphy_priv(wiphy); - - return ieee80211_cancel_roc(local, cookie, true); + return 0; } static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, @@ -3565,7 +3090,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; - struct sk_buff *skb, *ack_skb; + struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos; @@ -3633,10 +3158,9 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, if (qos) nullfunc->qos_ctrl = cpu_to_le16(7); - ack_skb = ieee80211_make_ack_skb(local, skb, cookie, GFP_ATOMIC); - if (IS_ERR(ack_skb)) { + ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_ATOMIC); + if (ret) { kfree_skb(skb); - ret = PTR_ERR(ack_skb); goto unlock; } @@ -3838,6 +3362,7 @@ const struct cfg80211_ops mac80211_config_ops = { .suspend = ieee80211_suspend, .resume = ieee80211_resume, .scan = ieee80211_scan, + .abort_scan = ieee80211_abort_scan, .sched_scan_start = ieee80211_sched_scan_start, .sched_scan_stop = ieee80211_sched_scan_stop, .auth = ieee80211_auth, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 4d2aaebd4f97..3e24d0ddb51b 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -91,7 +91,7 @@ static const struct file_operations reset_ops = { }; #endif -static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), @@ -125,9 +125,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(TDLS_WIDER_BW), FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), - - /* keep last for the build bug below */ - (void *)0x1 + FLAG(NEEDS_UNIQUE_STA_ADDR), #undef FLAG }; @@ -147,7 +145,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, /* fail compilation if somebody adds or removes * a flag without updating the name array above */ - BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 337bb5d78003..f7fc0e00497f 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -428,6 +428,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, chandef.width = sdata->u.ibss.chandef.width; break; case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_160: chandef = sdata->u.ibss.chandef; chandef.chan = cbss->channel; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d832bd59236b..b84f6aa32c08 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -325,19 +325,15 @@ struct mesh_preq_queue { struct ieee80211_roc_work { struct list_head list; - struct list_head dependents; - - struct delayed_work work; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *chan; bool started, abort, hw_begun, notified; - bool to_be_freed; bool on_channel; - unsigned long hw_start_time; + unsigned long start_time; u32 duration, req_duration; struct sk_buff *frame; @@ -1335,6 +1331,7 @@ struct ieee80211_local { /* * Remain-on-channel support */ + struct delayed_work roc_work; struct list_head roc_list; struct work_struct hw_roc_start, hw_roc_done; unsigned long hw_roc_start_time; @@ -1483,6 +1480,10 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, void ieee80211_configure_filter(struct ieee80211_local *local); u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata); +u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); +int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, + u64 *cookie, gfp_t gfp); + /* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, @@ -1577,16 +1578,22 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_local *local); void ieee80211_sched_scan_end(struct ieee80211_local *local); void ieee80211_sched_scan_stopped_work(struct work_struct *work); -/* off-channel helpers */ +/* off-channel/mgmt-tx */ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local); void ieee80211_offchannel_return(struct ieee80211_local *local); void ieee80211_roc_setup(struct ieee80211_local *local); void ieee80211_start_next_roc(struct ieee80211_local *local); void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); -void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free); -void ieee80211_sw_roc_work(struct work_struct *work); -void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc); +int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + struct ieee80211_channel *chan, + unsigned int duration, u64 *cookie); +int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, + struct wireless_dev *wdev, u64 cookie); +int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_mgmt_tx_params *params, u64 *cookie); +int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, + struct wireless_dev *wdev, u64 cookie); /* channel switch handling */ void ieee80211_csa_finalize_work(struct work_struct *work); @@ -1709,10 +1716,10 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta); void ieee80211_sta_set_rx_nss(struct sta_info *sta); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only); + enum ieee80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only); + enum ieee80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d0dc1bfaeec2..c9e325d2e120 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -76,7 +76,8 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, bool update_bss) { - if (__ieee80211_recalc_txpower(sdata) || update_bss) + if (__ieee80211_recalc_txpower(sdata) || + (update_bss && ieee80211_sdata_running(sdata))) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER); } @@ -1861,6 +1862,7 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) unregister_netdevice(sdata->dev); } else { cfg80211_unregister_wdev(&sdata->wdev); + ieee80211_teardown_sdata(sdata); kfree(sdata); } } @@ -1870,7 +1872,6 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata) if (WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state))) return; ieee80211_do_stop(sdata, true); - ieee80211_teardown_sdata(sdata); } void ieee80211_remove_interfaces(struct ieee80211_local *local) diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 44388d6a1d8e..5e5bc599da4c 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -320,7 +321,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, return; if (new) - list_add_tail(&new->list, &sdata->key_list); + list_add_tail_rcu(&new->list, &sdata->key_list); WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); @@ -368,7 +369,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, } if (old) - list_del(&old->list); + list_del_rcu(&old->list); } struct ieee80211_key * @@ -592,8 +593,8 @@ static void ieee80211_key_destroy(struct ieee80211_key *key, return; /* - * Synchronize so the TX path can no longer be using - * this key before we free/remove it. + * Synchronize so the TX path and rcu key iterators + * can no longer be using this key before we free/remove it. */ synchronize_net(); @@ -744,6 +745,53 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_iter_keys); +static void +_ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, + struct ieee80211_sub_if_data *sdata, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key, + void *data), + void *iter_data) +{ + struct ieee80211_key *key; + + list_for_each_entry_rcu(key, &sdata->key_list, list) { + /* skip keys of station in removal process */ + if (key->sta && key->sta->removed) + continue; + if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) + continue; + + iter(hw, &sdata->vif, + key->sta ? &key->sta->sta : NULL, + &key->conf, iter_data); + } +} + +void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key, + void *data), + void *iter_data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + + if (vif) { + sdata = vif_to_sdata(vif); + _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data); + } else { + list_for_each_entry_rcu(sdata, &local->interfaces, list) + _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data); + } +} +EXPORT_SYMBOL(ieee80211_iter_keys_rcu); + static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, struct list_head *keys) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 858f6b1cb149..6bcf0faa4a89 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1149,6 +1149,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) rtnl_unlock(); + cancel_delayed_work_sync(&local->roc_work); cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); cancel_work_sync(&local->tdls_chsw_work); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index b890e225a8f1..dadf8dc6f1cf 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -779,10 +779,8 @@ void mesh_plink_broken(struct sta_info *sta) static void mesh_path_node_reclaim(struct rcu_head *rp) { struct mpath_node *node = container_of(rp, struct mpath_node, rcu); - struct ieee80211_sub_if_data *sdata = node->mpath->sdata; del_timer_sync(&node->mpath->timer); - atomic_dec(&sdata->u.mesh.mpaths); kfree(node->mpath); kfree(node); } @@ -790,8 +788,9 @@ static void mesh_path_node_reclaim(struct rcu_head *rp) /* needs to be called with the corresponding hashwlock taken */ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) { - struct mesh_path *mpath; - mpath = node->mpath; + struct mesh_path *mpath = node->mpath; + struct ieee80211_sub_if_data *sdata = node->mpath->sdata; + spin_lock(&mpath->state_lock); mpath->flags |= MESH_PATH_RESOLVING; if (mpath->is_gate) @@ -799,6 +798,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) hlist_del_rcu(&node->list); call_rcu(&node->rcu, mesh_path_node_reclaim); spin_unlock(&mpath->state_lock); + atomic_dec(&sdata->u.mesh.mpaths); atomic_dec(&tbl->entries); } @@ -968,8 +968,8 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) copy = true; } else { mpath_dbg(sdata, - "Not forwarding %p (flags %#x)\n", - gate->mpath, gate->mpath->flags); + "Not forwarding to %pM (flags %#x)\n", + gate->mpath->dst, gate->mpath->flags); } } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b140cc6651f4..1c342e2592c4 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1379,21 +1379,26 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata, */ if (has_80211h_pwr && (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { + new_ap_level = pwr_level_80211h; + + if (sdata->ap_power_level == new_ap_level) + return 0; + sdata_dbg(sdata, "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", pwr_level_80211h, chan_pwr, pwr_reduction_80211h, sdata->u.mgd.bssid); - new_ap_level = pwr_level_80211h; } else { /* has_cisco_pwr is always true here. */ + new_ap_level = pwr_level_cisco; + + if (sdata->ap_power_level == new_ap_level) + return 0; + sdata_dbg(sdata, "Limiting TX power to %d dBm as advertised by %pM\n", pwr_level_cisco, sdata->u.mgd.bssid); - new_ap_level = pwr_level_cisco; } - if (sdata->ap_power_level == new_ap_level) - return 0; - sdata->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(sdata)) return BSS_CHANGED_TXPOWER; @@ -1930,7 +1935,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.flags |= IEEE80211_STA_RESET_SIGNAL_AVE; - if (sdata->vif.p2p) { + if (sdata->vif.p2p || + sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); @@ -3458,7 +3464,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } } - if (sdata->vif.p2p) { + if (sdata->vif.p2p || + sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { struct ieee80211_p2p_noa_attr noa = {}; int ret; @@ -3575,7 +3582,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (sta && elems.opmode_notif) ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif, - rx_status->band, true); + rx_status->band); mutex_unlock(&local->sta_mtx); changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 04401037140e..8b2f4eaac2ba 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -187,11 +187,80 @@ void ieee80211_offchannel_return(struct ieee80211_local *local) false); } -void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc) +static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) { - if (roc->notified) + /* was never transmitted */ + if (roc->frame) { + cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie, + roc->frame->data, roc->frame->len, + false, GFP_KERNEL); + ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame); + } + + if (!roc->mgmt_tx_cookie) + cfg80211_remain_on_channel_expired(&roc->sdata->wdev, + roc->cookie, roc->chan, + GFP_KERNEL); + + list_del(&roc->list); + kfree(roc); +} + +static unsigned long ieee80211_end_finished_rocs(struct ieee80211_local *local, + unsigned long now) +{ + struct ieee80211_roc_work *roc, *tmp; + long remaining_dur_min = LONG_MAX; + + lockdep_assert_held(&local->mtx); + + list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { + long remaining; + + if (!roc->started) + break; + + remaining = roc->start_time + + msecs_to_jiffies(roc->duration) - + now; + + /* In case of HW ROC, it is possible that the HW finished the + * ROC session before the actual requested time. In such a case + * end the ROC session (disregarding the remaining time). + */ + if (roc->abort || roc->hw_begun || remaining <= 0) + ieee80211_roc_notify_destroy(roc); + else + remaining_dur_min = min(remaining_dur_min, remaining); + } + + return remaining_dur_min; +} + +static bool ieee80211_recalc_sw_work(struct ieee80211_local *local, + unsigned long now) +{ + long dur = ieee80211_end_finished_rocs(local, now); + + if (dur == LONG_MAX) + return false; + + mod_delayed_work(local->workqueue, &local->roc_work, dur); + return true; +} + +static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, + unsigned long start_time) +{ + struct ieee80211_local *local = roc->sdata->local; + + if (WARN_ON(roc->notified)) return; + roc->start_time = start_time; + roc->started = true; + roc->hw_begun = true; + if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, @@ -205,40 +274,26 @@ void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc) } roc->notified = true; + + if (!local->ops->remain_on_channel) + ieee80211_recalc_sw_work(local, start_time); } static void ieee80211_hw_roc_start(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_start); - struct ieee80211_roc_work *roc, *dep, *tmp; + struct ieee80211_roc_work *roc; mutex_lock(&local->mtx); - if (list_empty(&local->roc_list)) - goto out_unlock; - - roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, - list); - - if (!roc->started) - goto out_unlock; - - roc->hw_begun = true; - roc->hw_start_time = local->hw_roc_start_time; + list_for_each_entry(roc, &local->roc_list, list) { + if (!roc->started) + break; - ieee80211_handle_roc_started(roc); - list_for_each_entry_safe(dep, tmp, &roc->dependents, list) { - ieee80211_handle_roc_started(dep); - - if (dep->duration > roc->duration) { - u32 dur = dep->duration; - dep->duration = dur - roc->duration; - roc->duration = dur; - list_move(&dep->list, &roc->list); - } + ieee80211_handle_roc_started(roc, local->hw_roc_start_time); } - out_unlock: + mutex_unlock(&local->mtx); } @@ -254,34 +309,40 @@ void ieee80211_ready_on_channel(struct ieee80211_hw *hw) } EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel); -void ieee80211_start_next_roc(struct ieee80211_local *local) +static void _ieee80211_start_next_roc(struct ieee80211_local *local) { - struct ieee80211_roc_work *roc; + struct ieee80211_roc_work *roc, *tmp; + enum ieee80211_roc_type type; + u32 min_dur, max_dur; lockdep_assert_held(&local->mtx); - if (list_empty(&local->roc_list)) { - ieee80211_run_deferred_scan(local); + if (WARN_ON(list_empty(&local->roc_list))) return; - } roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); - if (WARN_ON_ONCE(roc->started)) + if (WARN_ON(roc->started)) return; - if (local->ops->remain_on_channel) { - int ret, duration = roc->duration; - - /* XXX: duplicated, see ieee80211_start_roc_work() */ - if (!duration) - duration = 10; + min_dur = roc->duration; + max_dur = roc->duration; + type = roc->type; - ret = drv_remain_on_channel(local, roc->sdata, roc->chan, - duration, roc->type); + list_for_each_entry(tmp, &local->roc_list, list) { + if (tmp == roc) + continue; + if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) + break; + max_dur = max(tmp->duration, max_dur); + min_dur = min(tmp->duration, min_dur); + type = max(tmp->type, type); + } - roc->started = true; + if (local->ops->remain_on_channel) { + int ret = drv_remain_on_channel(local, roc->sdata, roc->chan, + max_dur, type); if (ret) { wiphy_warn(local->hw.wiphy, @@ -290,74 +351,24 @@ void ieee80211_start_next_roc(struct ieee80211_local *local) * queue the work struct again to avoid recursion * when multiple failures occur */ - ieee80211_remain_on_channel_expired(&local->hw); + list_for_each_entry(tmp, &local->roc_list, list) { + if (tmp->sdata != roc->sdata || + tmp->chan != roc->chan) + break; + tmp->started = true; + tmp->abort = true; + } + ieee80211_queue_work(&local->hw, &local->hw_roc_done); + return; } - } else { - /* delay it a bit */ - ieee80211_queue_delayed_work(&local->hw, &roc->work, - round_jiffies_relative(HZ/2)); - } -} - -void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free) -{ - struct ieee80211_roc_work *dep, *tmp; - - if (WARN_ON(roc->to_be_freed)) - return; - - /* was never transmitted */ - if (roc->frame) { - cfg80211_mgmt_tx_status(&roc->sdata->wdev, - (unsigned long)roc->frame, - roc->frame->data, roc->frame->len, - false, GFP_KERNEL); - kfree_skb(roc->frame); - } - - if (!roc->mgmt_tx_cookie) - cfg80211_remain_on_channel_expired(&roc->sdata->wdev, - roc->cookie, roc->chan, - GFP_KERNEL); - - list_for_each_entry_safe(dep, tmp, &roc->dependents, list) - ieee80211_roc_notify_destroy(dep, true); - - if (free) - kfree(roc); - else - roc->to_be_freed = true; -} - -void ieee80211_sw_roc_work(struct work_struct *work) -{ - struct ieee80211_roc_work *roc = - container_of(work, struct ieee80211_roc_work, work.work); - struct ieee80211_sub_if_data *sdata = roc->sdata; - struct ieee80211_local *local = sdata->local; - bool started, on_channel; - - mutex_lock(&local->mtx); - - if (roc->to_be_freed) - goto out_unlock; - - if (roc->abort) - goto finish; - - if (WARN_ON(list_empty(&local->roc_list))) - goto out_unlock; - - if (WARN_ON(roc != list_first_entry(&local->roc_list, - struct ieee80211_roc_work, - list))) - goto out_unlock; - - if (!roc->started) { - struct ieee80211_roc_work *dep; - - WARN_ON(local->use_chanctx); + /* we'll notify about the start once the HW calls back */ + list_for_each_entry(tmp, &local->roc_list, list) { + if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) + break; + tmp->started = true; + } + } else { /* If actually operating on the desired channel (with at least * 20 MHz channel width) don't stop all the operations but still * treat it as though the ROC operation started properly, so @@ -377,27 +388,72 @@ void ieee80211_sw_roc_work(struct work_struct *work) ieee80211_hw_config(local, 0); } - /* tell userspace or send frame */ - ieee80211_handle_roc_started(roc); - list_for_each_entry(dep, &roc->dependents, list) - ieee80211_handle_roc_started(dep); + ieee80211_queue_delayed_work(&local->hw, &local->roc_work, + msecs_to_jiffies(min_dur)); - /* if it was pure TX, just finish right away */ - if (!roc->duration) - goto finish; + /* tell userspace or send frame(s) */ + list_for_each_entry(tmp, &local->roc_list, list) { + if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) + break; - roc->started = true; - ieee80211_queue_delayed_work(&local->hw, &roc->work, - msecs_to_jiffies(roc->duration)); + tmp->on_channel = roc->on_channel; + ieee80211_handle_roc_started(tmp, jiffies); + } + } +} + +void ieee80211_start_next_roc(struct ieee80211_local *local) +{ + struct ieee80211_roc_work *roc; + + lockdep_assert_held(&local->mtx); + + if (list_empty(&local->roc_list)) { + ieee80211_run_deferred_scan(local); + return; + } + + roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, + list); + + if (WARN_ON_ONCE(roc->started)) + return; + + if (local->ops->remain_on_channel) { + _ieee80211_start_next_roc(local); + } else { + /* delay it a bit */ + ieee80211_queue_delayed_work(&local->hw, &local->roc_work, + round_jiffies_relative(HZ/2)); + } +} + +static void __ieee80211_roc_work(struct ieee80211_local *local) +{ + struct ieee80211_roc_work *roc; + bool on_channel; + + lockdep_assert_held(&local->mtx); + + if (WARN_ON(local->ops->remain_on_channel)) + return; + + roc = list_first_entry_or_null(&local->roc_list, + struct ieee80211_roc_work, list); + if (!roc) + return; + + if (!roc->started) { + WARN_ON(local->use_chanctx); + _ieee80211_start_next_roc(local); } else { - /* finish this ROC */ - finish: - list_del(&roc->list); - started = roc->started; on_channel = roc->on_channel; - ieee80211_roc_notify_destroy(roc, !roc->abort); + if (ieee80211_recalc_sw_work(local, jiffies)) + return; + + /* careful - roc pointer became invalid during recalc */ - if (started && !on_channel) { + if (!on_channel) { ieee80211_flush_queues(local, NULL, false); local->tmp_channel = NULL; @@ -407,14 +463,17 @@ void ieee80211_sw_roc_work(struct work_struct *work) } ieee80211_recalc_idle(local); - - if (started) - ieee80211_start_next_roc(local); - else if (list_empty(&local->roc_list)) - ieee80211_run_deferred_scan(local); + ieee80211_start_next_roc(local); } +} - out_unlock: +static void ieee80211_roc_work(struct work_struct *work) +{ + struct ieee80211_local *local = + container_of(work, struct ieee80211_local, roc_work.work); + + mutex_lock(&local->mtx); + __ieee80211_roc_work(local); mutex_unlock(&local->mtx); } @@ -422,27 +481,14 @@ static void ieee80211_hw_roc_done(struct work_struct *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_done); - struct ieee80211_roc_work *roc; mutex_lock(&local->mtx); - if (list_empty(&local->roc_list)) - goto out_unlock; - - roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, - list); - - if (!roc->started) - goto out_unlock; - - list_del(&roc->list); - - ieee80211_roc_notify_destroy(roc, true); + ieee80211_end_finished_rocs(local, jiffies); /* if there's another roc, start it now */ ieee80211_start_next_roc(local); - out_unlock: mutex_unlock(&local->mtx); } @@ -456,47 +502,500 @@ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw) } EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired); -void ieee80211_roc_setup(struct ieee80211_local *local) +static bool +ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local, + struct ieee80211_roc_work *new_roc, + struct ieee80211_roc_work *cur_roc) { - INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start); - INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done); - INIT_LIST_HEAD(&local->roc_list); + unsigned long now = jiffies; + unsigned long remaining; + + if (WARN_ON(!cur_roc->started)) + return false; + + /* if it was scheduled in the hardware, but not started yet, + * we can only combine if the older one had a longer duration + */ + if (!cur_roc->hw_begun && new_roc->duration > cur_roc->duration) + return false; + + remaining = cur_roc->start_time + + msecs_to_jiffies(cur_roc->duration) - + now; + + /* if it doesn't fit entirely, schedule a new one */ + if (new_roc->duration > jiffies_to_msecs(remaining)) + return false; + + /* add just after the current one so we combine their finish later */ + list_add(&new_roc->list, &cur_roc->list); + + /* if the existing one has already begun then let this one also + * begin, otherwise they'll both be marked properly by the work + * struct that runs once the driver notifies us of the beginning + */ + if (cur_roc->hw_begun) + ieee80211_handle_roc_started(new_roc, now); + + return true; } -void ieee80211_roc_purge(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) +static int ieee80211_start_roc_work(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_channel *channel, + unsigned int duration, u64 *cookie, + struct sk_buff *txskb, + enum ieee80211_roc_type type) { struct ieee80211_roc_work *roc, *tmp; - LIST_HEAD(tmp_list); + bool queued = false, combine_started = true; + int ret; + + lockdep_assert_held(&local->mtx); + + if (local->use_chanctx && !local->ops->remain_on_channel) + return -EOPNOTSUPP; + + roc = kzalloc(sizeof(*roc), GFP_KERNEL); + if (!roc) + return -ENOMEM; + + /* + * If the duration is zero, then the driver + * wouldn't actually do anything. Set it to + * 10 for now. + * + * TODO: cancel the off-channel operation + * when we get the SKB's TX status and + * the wait time was zero before. + */ + if (!duration) + duration = 10; + + roc->chan = channel; + roc->duration = duration; + roc->req_duration = duration; + roc->frame = txskb; + roc->type = type; + roc->sdata = sdata; + + /* + * cookie is either the roc cookie (for normal roc) + * or the SKB (for mgmt TX) + */ + if (!txskb) { + roc->cookie = ieee80211_mgmt_tx_cookie(local); + *cookie = roc->cookie; + } else { + roc->mgmt_tx_cookie = *cookie; + } + + /* if there's no need to queue, handle it immediately */ + if (list_empty(&local->roc_list) && + !local->scanning && !ieee80211_is_radar_required(local)) { + /* if not HW assist, just queue & schedule work */ + if (!local->ops->remain_on_channel) { + list_add_tail(&roc->list, &local->roc_list); + ieee80211_queue_delayed_work(&local->hw, + &local->roc_work, 0); + } else { + /* otherwise actually kick it off here + * (for error handling) + */ + ret = drv_remain_on_channel(local, sdata, channel, + duration, type); + if (ret) { + kfree(roc); + return ret; + } + roc->started = true; + list_add_tail(&roc->list, &local->roc_list); + } + + return 0; + } + + /* otherwise handle queueing */ + + list_for_each_entry(tmp, &local->roc_list, list) { + if (tmp->chan != channel || tmp->sdata != sdata) + continue; + + /* + * Extend this ROC if possible: If it hasn't started, add + * just after the new one to combine. + */ + if (!tmp->started) { + list_add(&roc->list, &tmp->list); + queued = true; + break; + } + + if (!combine_started) + continue; + + if (!local->ops->remain_on_channel) { + /* If there's no hardware remain-on-channel, and + * doing so won't push us over the maximum r-o-c + * we allow, then we can just add the new one to + * the list and mark it as having started now. + * If it would push over the limit, don't try to + * combine with other started ones (that haven't + * been running as long) but potentially sort it + * with others that had the same fate. + */ + unsigned long now = jiffies; + u32 elapsed = jiffies_to_msecs(now - tmp->start_time); + struct wiphy *wiphy = local->hw.wiphy; + u32 max_roc = wiphy->max_remain_on_channel_duration; + + if (elapsed + roc->duration > max_roc) { + combine_started = false; + continue; + } + + list_add(&roc->list, &tmp->list); + queued = true; + roc->on_channel = tmp->on_channel; + ieee80211_handle_roc_started(roc, now); + break; + } + + queued = ieee80211_coalesce_hw_started_roc(local, roc, tmp); + if (queued) + break; + /* if it wasn't queued, perhaps it can be combined with + * another that also couldn't get combined previously, + * but no need to check for already started ones, since + * that can't work. + */ + combine_started = false; + } + + if (!queued) + list_add_tail(&roc->list, &local->roc_list); + + return 0; +} + +int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, + struct ieee80211_channel *chan, + unsigned int duration, u64 *cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct ieee80211_local *local = sdata->local; + int ret; + + mutex_lock(&local->mtx); + ret = ieee80211_start_roc_work(local, sdata, chan, + duration, cookie, NULL, + IEEE80211_ROC_TYPE_NORMAL); + mutex_unlock(&local->mtx); + + return ret; +} + +static int ieee80211_cancel_roc(struct ieee80211_local *local, + u64 cookie, bool mgmt_tx) +{ + struct ieee80211_roc_work *roc, *tmp, *found = NULL; + int ret; + + if (!cookie) + return -ENOENT; mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { - if (sdata && roc->sdata != sdata) + if (!mgmt_tx && roc->cookie != cookie) continue; + else if (mgmt_tx && roc->mgmt_tx_cookie != cookie) + continue; + + found = roc; + break; + } + + if (!found) { + mutex_unlock(&local->mtx); + return -ENOENT; + } + + if (!found->started) { + ieee80211_roc_notify_destroy(found); + goto out_unlock; + } - if (roc->started && local->ops->remain_on_channel) { - /* can race, so ignore return value */ - drv_cancel_remain_on_channel(local); + if (local->ops->remain_on_channel) { + ret = drv_cancel_remain_on_channel(local); + if (WARN_ON_ONCE(ret)) { + mutex_unlock(&local->mtx); + return ret; + } + + /* TODO: + * if multiple items were combined here then we really shouldn't + * cancel them all - we should wait for as much time as needed + * for the longest remaining one, and only then cancel ... + */ + list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { + if (!roc->started) + break; + if (roc == found) + found = NULL; + ieee80211_roc_notify_destroy(roc); } - list_move_tail(&roc->list, &tmp_list); - roc->abort = true; + /* that really must not happen - it was started */ + WARN_ON(found); + + ieee80211_start_next_roc(local); + } else { + /* go through work struct to return to the operating channel */ + found->abort = true; + mod_delayed_work(local->workqueue, &local->roc_work, 0); } + + out_unlock: mutex_unlock(&local->mtx); - list_for_each_entry_safe(roc, tmp, &tmp_list, list) { - if (local->ops->remain_on_channel) { - list_del(&roc->list); - ieee80211_roc_notify_destroy(roc, true); + return 0; +} + +int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, + struct wireless_dev *wdev, u64 cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct ieee80211_local *local = sdata->local; + + return ieee80211_cancel_roc(local, cookie, false); +} + +int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_mgmt_tx_params *params, u64 *cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct sta_info *sta; + const struct ieee80211_mgmt *mgmt = (void *)params->buf; + bool need_offchan = false; + u32 flags; + int ret; + u8 *data; + + if (params->dont_wait_for_ack) + flags = IEEE80211_TX_CTL_NO_ACK; + else + flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_REQ_TX_STATUS; + + if (params->no_cck) + flags |= IEEE80211_TX_CTL_NO_CCK_RATE; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_ADHOC: + if (!sdata->vif.bss_conf.ibss_joined) + need_offchan = true; + /* fall through */ +#ifdef CONFIG_MAC80211_MESH + case NL80211_IFTYPE_MESH_POINT: + if (ieee80211_vif_is_mesh(&sdata->vif) && + !sdata->u.mesh.mesh_id_len) + need_offchan = true; + /* fall through */ +#endif + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: + if (sdata->vif.type != NL80211_IFTYPE_ADHOC && + !ieee80211_vif_is_mesh(&sdata->vif) && + !rcu_access_pointer(sdata->bss->beacon)) + need_offchan = true; + if (!ieee80211_is_action(mgmt->frame_control) || + mgmt->u.action.category == WLAN_CATEGORY_PUBLIC || + mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED || + mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) + break; + rcu_read_lock(); + sta = sta_info_get(sdata, mgmt->da); + rcu_read_unlock(); + if (!sta) + return -ENOLINK; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + sdata_lock(sdata); + if (!sdata->u.mgd.associated || + (params->offchan && params->wait && + local->ops->remain_on_channel && + memcmp(sdata->u.mgd.associated->bssid, + mgmt->bssid, ETH_ALEN))) + need_offchan = true; + sdata_unlock(sdata); + break; + case NL80211_IFTYPE_P2P_DEVICE: + need_offchan = true; + break; + default: + return -EOPNOTSUPP; + } + + /* configurations requiring offchan cannot work if no channel has been + * specified + */ + if (need_offchan && !params->chan) + return -EINVAL; + + mutex_lock(&local->mtx); + + /* Check if the operating channel is the requested channel */ + if (!need_offchan) { + struct ieee80211_chanctx_conf *chanctx_conf; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + + if (chanctx_conf) { + need_offchan = params->chan && + (params->chan != + chanctx_conf->def.chan); + } else if (!params->chan) { + ret = -EINVAL; + rcu_read_unlock(); + goto out_unlock; } else { - ieee80211_queue_delayed_work(&local->hw, &roc->work, 0); + need_offchan = true; + } + rcu_read_unlock(); + } + + if (need_offchan && !params->offchan) { + ret = -EBUSY; + goto out_unlock; + } + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len); + if (!skb) { + ret = -ENOMEM; + goto out_unlock; + } + skb_reserve(skb, local->hw.extra_tx_headroom); + + data = skb_put(skb, params->len); + memcpy(data, params->buf, params->len); + + /* Update CSA counters */ + if (sdata->vif.csa_active && + (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT || + sdata->vif.type == NL80211_IFTYPE_ADHOC) && + params->n_csa_offsets) { + int i; + struct beacon_data *beacon = NULL; + + rcu_read_lock(); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + beacon = rcu_dereference(sdata->u.ap.beacon); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + beacon = rcu_dereference(sdata->u.ibss.presp); + else if (ieee80211_vif_is_mesh(&sdata->vif)) + beacon = rcu_dereference(sdata->u.mesh.beacon); + + if (beacon) + for (i = 0; i < params->n_csa_offsets; i++) + data[params->csa_offsets[i]] = + beacon->csa_current_counter; + + rcu_read_unlock(); + } - /* work will clean up etc */ - flush_delayed_work(&roc->work); - WARN_ON(!roc->to_be_freed); - kfree(roc); + IEEE80211_SKB_CB(skb)->flags = flags; + + skb->dev = sdata->dev; + + if (!params->dont_wait_for_ack) { + /* make a copy to preserve the frame contents + * in case of encryption. + */ + ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_KERNEL); + if (ret) { + kfree_skb(skb); + goto out_unlock; } + } else { + /* Assign a dummy non-zero cookie, it's not sent to + * userspace in this case but we rely on its value + * internally in the need_offchan case to distinguish + * mgmt-tx from remain-on-channel. + */ + *cookie = 0xffffffff; } - WARN_ON_ONCE(!list_empty(&tmp_list)); + if (!need_offchan) { + ieee80211_tx_skb(sdata, skb); + ret = 0; + goto out_unlock; + } + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | + IEEE80211_TX_INTFL_OFFCHAN_TX_OK; + if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) + IEEE80211_SKB_CB(skb)->hw_queue = + local->hw.offchannel_tx_hw_queue; + + /* This will handle all kinds of coalescing and immediate TX */ + ret = ieee80211_start_roc_work(local, sdata, params->chan, + params->wait, cookie, skb, + IEEE80211_ROC_TYPE_MGMT_TX); + if (ret) + ieee80211_free_txskb(&local->hw, skb); + out_unlock: + mutex_unlock(&local->mtx); + return ret; +} + +int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, + struct wireless_dev *wdev, u64 cookie) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + return ieee80211_cancel_roc(local, cookie, true); +} + +void ieee80211_roc_setup(struct ieee80211_local *local) +{ + INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start); + INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done); + INIT_DELAYED_WORK(&local->roc_work, ieee80211_roc_work); + INIT_LIST_HEAD(&local->roc_list); +} + +void ieee80211_roc_purge(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_roc_work *roc, *tmp; + bool work_to_do = false; + + mutex_lock(&local->mtx); + list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { + if (sdata && roc->sdata != sdata) + continue; + + if (roc->started) { + if (local->ops->remain_on_channel) { + /* can race, so ignore return value */ + drv_cancel_remain_on_channel(local); + ieee80211_roc_notify_destroy(roc); + } else { + roc->abort = true; + work_to_do = true; + } + } else { + ieee80211_roc_notify_destroy(roc); + } + } + if (work_to_do) + __ieee80211_roc_work(local); + mutex_unlock(&local->mtx); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8bae5de0dc44..bc081850ac0e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -661,8 +661,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - WARN_ONCE((unsigned long)rx->skb->data & 1, - "unaligned packet at 0x%p\n", rx->skb->data); + WARN_ON_ONCE((unsigned long)rx->skb->data & 1); #endif } @@ -2736,8 +2735,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; ieee80211_vht_handle_opmode(rx->sdata, rx->sta, - opmode, status->band, - false); + opmode, status->band); goto handled; } default: diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4aeca4b0c3cb..a413e52f7691 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -597,8 +597,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, /* We need to ensure power level is at max for scanning. */ ieee80211_hw_config(local, 0); - if ((req->channels[0]->flags & - IEEE80211_CHAN_NO_IR) || + if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | + IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; } else { @@ -645,7 +645,7 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ - if (chan->flags & IEEE80211_CHAN_NO_IR) + if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } @@ -777,7 +777,8 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, * * In any case, it is not necessary for a passive scan. */ - if (chan->flags & IEEE80211_CHAN_NO_IR || !scan_req->n_ssids) { + if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || + !scan_req->n_ssids) { *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; return; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f91d1873218c..4402ad5b27d1 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2,6 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -435,6 +436,19 @@ static int sta_info_insert_check(struct sta_info *sta) is_multicast_ether_addr(sta->sta.addr))) return -EINVAL; + /* Strictly speaking this isn't necessary as we hold the mutex, but + * the rhashtable code can't really deal with that distinction. We + * do require the mutex for correctness though. + */ + rcu_read_lock(); + lockdep_assert_held(&sdata->local->sta_mtx); + if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && + ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { + rcu_read_unlock(); + return -ENOTUNIQ; + } + rcu_read_unlock(); + return 0; } @@ -554,14 +568,15 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) might_sleep(); + mutex_lock(&local->sta_mtx); + err = sta_info_insert_check(sta); if (err) { + mutex_unlock(&local->sta_mtx); rcu_read_lock(); goto out_free; } - mutex_lock(&local->sta_mtx); - err = sta_info_insert_finish(sta); if (err) goto out_free; @@ -868,6 +883,7 @@ static int __must_check __sta_info_destroy_part1(struct sta_info *sta) } list_del_rcu(&sta->list); + sta->removed = true; drv_sta_pre_rcu_remove(local, sta->sdata, sta); @@ -1230,11 +1246,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) ieee80211_check_fast_xmit(sta); } -static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, int tid, +static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, - bool call_driver) + bool call_driver, bool more_data) { + struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; @@ -1274,9 +1290,13 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata, if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); - if (reason == IEEE80211_FRAME_RELEASE_UAPSD) + if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); + if (more_data) + nullfunc->frame_control |= + cpu_to_le16(IEEE80211_FCTL_MOREDATA); + } } info = IEEE80211_SKB_CB(skb); @@ -1323,22 +1343,48 @@ static int find_highest_prio_tid(unsigned long tids) return fls(tids) - 1; } +/* Indicates if the MORE_DATA bit should be set in the last + * frame obtained by ieee80211_sta_ps_get_frames. + * Note that driver_release_tids is relevant only if + * reason = IEEE80211_FRAME_RELEASE_PSPOLL + */ +static bool +ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, + enum ieee80211_frame_release_type reason, + unsigned long driver_release_tids) +{ + int ac; + + /* If the driver has data on more than one TID then + * certainly there's more data if we release just a + * single frame now (from a single TID). This will + * only happen for PS-Poll. + */ + if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && + hweight16(driver_release_tids) > 1) + return true; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + if (ignored_acs & BIT(ac)) + continue; + + if (!skb_queue_empty(&sta->tx_filtered[ac]) || + !skb_queue_empty(&sta->ps_tx_buf[ac])) + return true; + } + + return false; +} + static void -ieee80211_sta_ps_deliver_response(struct sta_info *sta, - int n_frames, u8 ignored_acs, - enum ieee80211_frame_release_type reason) +ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, + enum ieee80211_frame_release_type reason, + struct sk_buff_head *frames, + unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - bool more_data = false; int ac; - unsigned long driver_release_tids = 0; - struct sk_buff_head frames; - - /* Service or PS-Poll period starts */ - set_sta_flag(sta, WLAN_STA_SP); - - __skb_queue_head_init(&frames); /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { @@ -1352,26 +1398,13 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* if we already have frames from software, then we can't also * release from hardware queues */ - if (skb_queue_empty(&frames)) { - driver_release_tids |= sta->driver_buffered_tids & tids; - driver_release_tids |= sta->txq_buffered_tids & tids; + if (skb_queue_empty(frames)) { + *driver_release_tids |= + sta->driver_buffered_tids & tids; + *driver_release_tids |= sta->txq_buffered_tids & tids; } - if (driver_release_tids) { - /* If the driver has data on more than one TID then - * certainly there's more data if we release just a - * single frame now (from a single TID). This will - * only happen for PS-Poll. - */ - if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && - hweight16(driver_release_tids) > 1) { - more_data = true; - driver_release_tids = - BIT(find_highest_prio_tid( - driver_release_tids)); - break; - } - } else { + if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { @@ -1385,20 +1418,44 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, if (!skb) break; n_frames--; - __skb_queue_tail(&frames, skb); + __skb_queue_tail(frames, skb); } } - /* If we have more frames buffered on this AC, then set the - * more-data bit and abort the loop since we can't send more - * data from other ACs before the buffered frames from this. + /* If we have more frames buffered on this AC, then abort the + * loop since we can't send more data from other ACs before + * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || - !skb_queue_empty(&sta->ps_tx_buf[ac])) { - more_data = true; + !skb_queue_empty(&sta->ps_tx_buf[ac])) break; - } } +} + +static void +ieee80211_sta_ps_deliver_response(struct sta_info *sta, + int n_frames, u8 ignored_acs, + enum ieee80211_frame_release_type reason) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + unsigned long driver_release_tids = 0; + struct sk_buff_head frames; + bool more_data; + + /* Service or PS-Poll period starts */ + set_sta_flag(sta, WLAN_STA_SP); + + __skb_queue_head_init(&frames); + + ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, + &frames, &driver_release_tids); + + more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); + + if (reason == IEEE80211_FRAME_RELEASE_PSPOLL) + driver_release_tids = + BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid; @@ -1421,7 +1478,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* This will evaluate to 1, 3, 5 or 7. */ tid = 7 - ((ffs(~ignored_acs) - 1) << 1); - ieee80211_send_null_response(sdata, sta, tid, reason, true); + ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; @@ -1521,8 +1578,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, if (need_null) ieee80211_send_null_response( - sdata, sta, find_highest_prio_tid(tids), - reason, false); + sta, find_highest_prio_tid(tids), + reason, false, false); sta_info_recalc_tim(sta); } else { @@ -1660,6 +1717,22 @@ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) } EXPORT_SYMBOL(ieee80211_sta_eosp); +void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + enum ieee80211_frame_release_type reason; + bool more_data; + + trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); + + reason = IEEE80211_FRAME_RELEASE_UAPSD; + more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, + reason, 0); + + ieee80211_send_null_response(sta, tid, reason, false, more_data); +} +EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); + void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 2cafb21b422f..d6051629ed15 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -367,6 +367,7 @@ DECLARE_EWMA(signal, 1024, 8) * @mesh: mesh STA information * @debugfs: debug filesystem info * @dead: set to true when sta is unlinked + * @removed: set to true when sta is being removed from sta_list * @uploaded: set to true when sta is uploaded to the driver * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) @@ -412,6 +413,7 @@ struct sta_info { u16 listen_interval; bool dead; + bool removed; bool uploaded; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 56c6d6cfa5a1..a6b4442776a0 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2027,6 +2027,31 @@ TRACE_EVENT(api_eosp, ) ); +TRACE_EVENT(api_send_eosp_nullfunc, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, + u8 tid), + + TP_ARGS(local, sta, tid), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u8, tid) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->tid = tid; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT " tid:%d", + LOCAL_PR_ARG, STA_PR_ARG, __entry->tid + ) +); + TRACE_EVENT(api_sta_set_buffered, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sta *sta, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index bdc224d5053a..3311ce0f3d6c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1431,7 +1431,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)]; } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { - dev_kfree_skb(skb); + ieee80211_purge_tx_queue(&local->hw, skbs); return true; } else vif = NULL; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 74058020b7d6..3943d4bf289c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -288,10 +288,13 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, if (!test_bit(reason, &local->queue_stop_reasons[queue])) return; - if (!refcounted) + if (!refcounted) { local->q_stop_reasons[queue][reason] = 0; - else + } else { local->q_stop_reasons[queue][reason]--; + if (WARN_ON(local->q_stop_reasons[queue][reason] < 0)) + local->q_stop_reasons[queue][reason] = 0; + } if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); @@ -1641,6 +1644,29 @@ void ieee80211_stop_device(struct ieee80211_local *local) drv_stop(local); } +static void ieee80211_flush_completed_scan(struct ieee80211_local *local, + bool aborted) +{ + /* It's possible that we don't handle the scan completion in + * time during suspend, so if it's still marked as completed + * here, queue the work and flush it to clean things up. + * Instead of calling the worker function directly here, we + * really queue it to avoid potential races with other flows + * scheduling the same work. + */ + if (test_bit(SCAN_COMPLETED, &local->scanning)) { + /* If coming from reconfiguration failure, abort the scan so + * we don't attempt to continue a partial HW scan - which is + * possible otherwise if (e.g.) the 2.4 GHz portion was the + * completed scan, and a 5 GHz portion is still pending. + */ + if (aborted) + set_bit(SCAN_ABORTED, &local->scanning); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); + flush_delayed_work(&local->scan_work); + } +} + static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; @@ -1660,6 +1686,8 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) local->suspended = false; local->in_reconfig = false; + ieee80211_flush_completed_scan(local, true); + /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state */ @@ -1698,6 +1726,27 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local, mutex_unlock(&local->chanctx_mtx); } +static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + /* add STAs back */ + mutex_lock(&local->sta_mtx); + list_for_each_entry(sta, &local->sta_list, list) { + enum ieee80211_sta_state state; + + if (!sta->uploaded || sta->sdata != sdata) + continue; + + for (state = IEEE80211_STA_NOTEXIST; + state < sta->sta_state; state++) + WARN_ON(drv_sta_state(local, sta->sdata, sta, state, + state + 1)); + } + mutex_unlock(&local->sta_mtx); +} + int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; @@ -1833,50 +1882,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(drv_add_chanctx(local, ctx)); mutex_unlock(&local->chanctx_mtx); - list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - ieee80211_assign_chanctx(local, sdata); - } - sdata = rtnl_dereference(local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata); } - /* add STAs back */ - mutex_lock(&local->sta_mtx); - list_for_each_entry(sta, &local->sta_list, list) { - enum ieee80211_sta_state state; - - if (!sta->uploaded) - continue; - - /* AP-mode stations will be added later */ - if (sta->sdata->vif.type == NL80211_IFTYPE_AP) - continue; - - for (state = IEEE80211_STA_NOTEXIST; - state < sta->sta_state; state++) - WARN_ON(drv_sta_state(local, sta->sdata, sta, state, - state + 1)); - } - mutex_unlock(&local->sta_mtx); - - /* reconfigure tx conf */ - if (hw->queues >= IEEE80211_NUM_ACS) { - list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_MONITOR || - !ieee80211_sdata_running(sdata)) - continue; - - for (i = 0; i < IEEE80211_NUM_ACS; i++) - drv_conf_tx(local, sdata, i, - &sdata->tx_conf[i]); - } - } - /* reconfigure hardware */ ieee80211_hw_config(local, ~0); @@ -1889,6 +1899,22 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (!ieee80211_sdata_running(sdata)) continue; + ieee80211_assign_chanctx(local, sdata); + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MONITOR: + break; + default: + ieee80211_reconfig_stations(sdata); + /* fall through */ + case NL80211_IFTYPE_AP: /* AP stations are handled later */ + for (i = 0; i < IEEE80211_NUM_ACS; i++) + drv_conf_tx(local, sdata, i, + &sdata->tx_conf[i]); + break; + } + /* common change flags for all interface types */ changed = BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | @@ -2074,17 +2100,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mb(); local->resuming = false; - /* It's possible that we don't handle the scan completion in - * time during suspend, so if it's still marked as completed - * here, queue the work and flush it to clean things up. - * Instead of calling the worker function directly here, we - * really queue it to avoid potential races with other flows - * scheduling the same work. - */ - if (test_bit(SCAN_COMPLETED, &local->scanning)) { - ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); - flush_delayed_work(&local->scan_work); - } + ieee80211_flush_completed_scan(local, false); if (local->open_count && !reconfig_due_to_wowlan) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index ff1c798921a6..c38b2f07a919 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -378,7 +378,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only) + enum ieee80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -401,9 +401,6 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, changed |= IEEE80211_RC_NSS_CHANGED; } - if (nss_only) - return changed; - switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; @@ -430,13 +427,12 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band, bool nss_only) + enum ieee80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; - u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, - band, nss_only); + u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band); if (changed > 0) rate_control_rate_update(local, sband, sta, changed); diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index 0550f3365e33..fd9daf2ecec9 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -18,9 +18,6 @@ drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb) static inline int drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) { - /* don't allow other operations while sync xmit */ - ASSERT_RTNL(); - might_sleep(); return local->ops->xmit_sync(&local->hw, skb); diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c index 8606da459ff3..3db16346cab3 100644 --- a/net/mac802154/mac_cmd.c +++ b/net/mac802154/mac_cmd.c @@ -126,7 +126,7 @@ static void mac802154_get_mac_params(struct net_device *dev, params->lbt = wpan_dev->lbt; } -static struct ieee802154_llsec_ops mac802154_llsec_ops = { +static const struct ieee802154_llsec_ops mac802154_llsec_ops = { .get_params = mac802154_get_params, .set_params = mac802154_set_params, .add_key = mac802154_add_key, diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 42e96729dae6..446e1300383e 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -217,8 +217,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, break; } - if (skb) - kfree_skb(skb); + kfree_skb(skb); } static void diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 3827f359b336..7e253455f9dd 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -38,12 +38,6 @@ void ieee802154_xmit_worker(struct work_struct *work) struct net_device *dev = skb->dev; int res; - rtnl_lock(); - - /* check if ifdown occurred while schedule */ - if (!netif_running(dev)) - goto err_tx; - res = drv_xmit_sync(local, skb); if (res) goto err_tx; @@ -53,14 +47,11 @@ void ieee802154_xmit_worker(struct work_struct *work) dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; - rtnl_unlock(); - return; err_tx: /* Restart the netif queue on each sub_if_data object. */ ieee802154_wake_queue(&local->hw); - rtnl_unlock(); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c70d750148b6..b18c5ed42d95 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -27,6 +27,8 @@ */ #define MAX_MP_SELECT_LABELS 4 +#define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) + static int zero = 0; static int label_limit = (1 << 20) - 1; @@ -96,22 +98,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); -static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, - struct sk_buff *skb, bool bos) +static u32 mpls_multipath_hash(struct mpls_route *rt, + struct sk_buff *skb, bool bos) { struct mpls_entry_decoded dec; struct mpls_shim_hdr *hdr; bool eli_seen = false; int label_index; - int nh_index = 0; u32 hash = 0; - /* No need to look further into packet if there's only - * one path - */ - if (rt->rt_nhn == 1) - goto out; - for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos; label_index++) { if (!pskb_may_pull(skb, sizeof(*hdr) * label_index)) @@ -165,7 +160,38 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, } } - nh_index = hash % rt->rt_nhn; + return hash; +} + +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, + struct sk_buff *skb, bool bos) +{ + int alive = ACCESS_ONCE(rt->rt_nhn_alive); + u32 hash = 0; + int nh_index = 0; + int n = 0; + + /* No need to look further into packet if there's only + * one path + */ + if (rt->rt_nhn == 1) + goto out; + + if (alive <= 0) + return NULL; + + hash = mpls_multipath_hash(rt, skb, bos); + nh_index = hash % alive; + if (alive == rt->rt_nhn) + goto out; + for_nexthops(rt) { + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + continue; + if (n == nh_index) + return nh; + n++; + } endfor_nexthops(rt); + out: return &rt->rt_nh[nh_index]; } @@ -317,7 +343,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, } } - err = neigh_xmit(nh->nh_via_table, out_dev, mpls_nh_via(rt, nh), skb); + /* If via wasn't specified then send out using device address */ + if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) + err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, + out_dev->dev_addr, skb); + else + err = neigh_xmit(nh->nh_via_table, out_dev, + mpls_nh_via(rt, nh), skb); if (err) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); @@ -365,6 +397,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen) GFP_KERNEL); if (rt) { rt->rt_nhn = num_nh; + rt->rt_nhn_alive = num_nh; rt->rt_max_alen = max_alen_aligned; } @@ -534,8 +567,22 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, if (!mpls_dev_get(dev)) goto errout; + if ((nh->nh_via_table == NEIGH_LINK_TABLE) && + (dev->addr_len != nh->nh_via_alen)) + goto errout; + RCU_INIT_POINTER(nh->nh_dev, dev); + if (!(dev->flags & IFF_UP)) { + nh->nh_flags |= RTNH_F_DEAD; + } else { + unsigned int flags; + + flags = dev_get_flags(dev); + if (!(flags & (IFF_RUNNING | IFF_LOWER_UP))) + nh->nh_flags |= RTNH_F_LINKDOWN; + } + return 0; errout: @@ -570,6 +617,9 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, if (err) goto errout; + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + rt->rt_nhn_alive--; + return 0; errout: @@ -577,8 +627,8 @@ errout: } static int mpls_nh_build(struct net *net, struct mpls_route *rt, - struct mpls_nh *nh, int oif, - struct nlattr *via, struct nlattr *newdst) + struct mpls_nh *nh, int oif, struct nlattr *via, + struct nlattr *newdst) { int err = -ENOMEM; @@ -592,10 +642,14 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, goto errout; } - err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, - __mpls_nh_via(rt, nh)); - if (err) - goto errout; + if (via) { + err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table, + __mpls_nh_via(rt, nh)); + if (err) + goto errout; + } else { + nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC; + } err = mpls_nh_assign_dev(net, rt, nh, oif); if (err) @@ -677,15 +731,14 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST); } - if (!nla_via) - goto errout; - err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, - rtnh->rtnh_ifindex, nla_via, - nla_newdst); + rtnh->rtnh_ifindex, nla_via, nla_newdst); if (err) goto errout; + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + rt->rt_nhn_alive--; + rtnh = rtnh_next(rtnh, &remaining); nhs++; } endfor_nexthops(rt); @@ -875,34 +928,74 @@ free: return ERR_PTR(err); } -static void mpls_ifdown(struct net_device *dev) +static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - struct mpls_dev *mdev; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt = rtnl_dereference(platform_label[index]); + if (!rt) continue; - for_nexthops(rt) { + + change_nexthops(rt) { if (rtnl_dereference(nh->nh_dev) != dev) continue; - nh->nh_dev = NULL; + switch (event) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + nh->nh_flags |= RTNH_F_DEAD; + /* fall through */ + case NETDEV_CHANGE: + nh->nh_flags |= RTNH_F_LINKDOWN; + ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; + break; + } + if (event == NETDEV_UNREGISTER) + RCU_INIT_POINTER(nh->nh_dev, NULL); } endfor_nexthops(rt); } - mdev = mpls_dev_get(dev); - if (!mdev) - return; - mpls_dev_sysctl_unregister(mdev); + return; +} + +static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) +{ + struct mpls_route __rcu **platform_label; + struct net *net = dev_net(dev); + unsigned index; + int alive; + + platform_label = rtnl_dereference(net->mpls.platform_label); + for (index = 0; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt = rtnl_dereference(platform_label[index]); + + if (!rt) + continue; + + alive = 0; + change_nexthops(rt) { + struct net_device *nh_dev = + rtnl_dereference(nh->nh_dev); - RCU_INIT_POINTER(dev->mpls_ptr, NULL); + if (!(nh->nh_flags & nh_flags)) { + alive++; + continue; + } + if (nh_dev != dev) + continue; + alive++; + nh->nh_flags &= ~nh_flags; + } endfor_nexthops(rt); + + ACCESS_ONCE(rt->rt_nhn_alive) = alive; + } - kfree_rcu(mdev, rcu); + return; } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, @@ -910,9 +1003,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mpls_dev *mdev; + unsigned int flags; - switch(event) { - case NETDEV_REGISTER: + if (event == NETDEV_REGISTER) { /* For now just support ethernet devices */ if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK)) { @@ -920,10 +1013,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); } - break; + return NOTIFY_OK; + } + + mdev = mpls_dev_get(dev); + if (!mdev) + return NOTIFY_OK; + switch (event) { + case NETDEV_DOWN: + mpls_ifdown(dev, event); + break; + case NETDEV_UP: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); + else + mpls_ifup(dev, RTNH_F_DEAD); + break; + case NETDEV_CHANGE: + flags = dev_get_flags(dev); + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); + else + mpls_ifdown(dev, event); + break; case NETDEV_UNREGISTER: - mpls_ifdown(dev); + mpls_ifdown(dev, event); + mdev = mpls_dev_get(dev); + if (mdev) { + mpls_dev_sysctl_unregister(mdev); + RCU_INIT_POINTER(dev->mpls_ptr, NULL); + kfree_rcu(mdev, rcu); + } break; case NETDEV_CHANGENAME: mdev = mpls_dev_get(dev); @@ -1118,6 +1240,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; + cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; @@ -1231,15 +1354,22 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; - if (nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && + nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; dev = rtnl_dereference(nh->nh_dev); if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; + if (nh->nh_flags & RTNH_F_LINKDOWN) + rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (nh->nh_flags & RTNH_F_DEAD) + rtm->rtm_flags |= RTNH_F_DEAD; } else { struct rtnexthop *rtnh; struct nlattr *mp; + int dead = 0; + int linkdown = 0; mp = nla_nest_start(skb, RTA_MULTIPATH); if (!mp) @@ -1253,11 +1383,21 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, dev = rtnl_dereference(nh->nh_dev); if (dev) rtnh->rtnh_ifindex = dev->ifindex; + if (nh->nh_flags & RTNH_F_LINKDOWN) { + rtnh->rtnh_flags |= RTNH_F_LINKDOWN; + linkdown++; + } + if (nh->nh_flags & RTNH_F_DEAD) { + rtnh->rtnh_flags |= RTNH_F_DEAD; + dead++; + } + if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, nh->nh_label)) goto nla_put_failure; - if (nla_put_via(skb, nh->nh_via_table, + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC && + nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; @@ -1266,6 +1406,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; } endfor_nexthops(rt); + if (linkdown == rt->rt_nhn) + rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (dead == rt->rt_nhn) + rtm->rtm_flags |= RTNH_F_DEAD; + nla_nest_end(skb, mp); } @@ -1319,7 +1464,8 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) if (nh->nh_dev) payload += nla_total_size(4); /* RTA_OIF */ - payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */ + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */ + payload += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) /* RTA_NEWDST */ payload += nla_total_size(nh->nh_labels * 4); } else { @@ -1328,7 +1474,9 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) for_nexthops(rt) { nhsize += nla_total_size(sizeof(struct rtnexthop)); - nhsize += nla_total_size(2 + nh->nh_via_alen); + /* RTA_VIA */ + if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) + nhsize += nla_total_size(2 + nh->nh_via_alen); if (nh->nh_labels) nhsize += nla_total_size(nh->nh_labels * 4); } endfor_nexthops(rt); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index bde52ce88c94..732a5c17e986 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -41,6 +41,7 @@ enum mpls_payload_type { struct mpls_nh { /* next hop label forwarding entry */ struct net_device __rcu *nh_dev; + unsigned int nh_flags; u32 nh_label[MAX_NEW_LABELS]; u8 nh_labels; u8 nh_via_alen; @@ -74,6 +75,7 @@ struct mpls_route { /* next hop label forwarding entry */ u8 rt_payload_type; u8 rt_max_alen; unsigned int rt_nhn; + unsigned int rt_nhn_alive; struct mpls_nh rt_nh[0]; }; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 67591aef9cae..fb31aa87de81 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) return en->labels * sizeof(struct mpls_shim_hdr); } -int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; @@ -54,10 +54,10 @@ int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) unsigned int ttl; /* Obtain the ttl */ - if (skb->protocol == htons(ETH_P_IP)) { + if (dst->ops->family == AF_INET) { ttl = ip_hdr(skb)->ttl; rt = (struct rtable *)dst; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + } else if (dst->ops->family == AF_INET6) { ttl = ipv6_hdr(skb)->hop_limit; rt6 = (struct rt6_info *)dst; } else { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e22349ea7256..8c067e6663a1 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -563,6 +563,28 @@ config NFT_COMPAT x_tables match/target extensions over the nf_tables framework. +if NF_TABLES_NETDEV + +config NF_DUP_NETDEV + tristate "Netfilter packet duplication support" + help + This option enables the generic packet duplication infrastructure + for Netfilter. + +config NFT_DUP_NETDEV + tristate "Netfilter nf_tables netdev packet duplication support" + select NF_DUP_NETDEV + help + This option enables packet duplication for the "netdev" family. + +config NFT_FWD_NETDEV + tristate "Netfilter nf_tables netdev packet forwarding support" + select NF_DUP_NETDEV + help + This option enables packet forwarding for the "netdev" family. + +endif # NF_TABLES_NETDEV + endif # NF_TABLES config NETFILTER_XTABLES @@ -869,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IP6_NF_IPTABLES + select NF_DUP_IPV6 if IP6_NF_IPTABLES != n ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. @@ -882,7 +904,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on IP_NF_MANGLE select NF_DEFRAG_IPV4 - select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help This option adds a `TPROXY' target, which is somewhat similar to REDIRECT. It can only be used in the mangle table and is useful @@ -1375,7 +1397,7 @@ config NETFILTER_XT_MATCH_SOCKET depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n select NF_DEFRAG_IPV4 - select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n help This option adds a `socket' match, which can be used to match packets for which a TCP or UDP socket lookup finds a valid socket. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 7638c36b498c..69134541d65b 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -66,8 +66,11 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o # SYNPROXY obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o +# generic packet duplication from netdev family +obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o + # nf_tables -nf_tables-objs += nf_tables_core.o nf_tables_api.o +nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o @@ -90,6 +93,10 @@ obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o obj-$(CONFIG_NFT_REDIR) += nft_redir.o +# nf_tables netdev +obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o +obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index d05e759ed0fa..b0bc475f641e 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -33,7 +33,7 @@ #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype MTYPE -#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id)) +#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) @@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set) del_timer_sync(&map->gc); ip_set_free(map->members); - if (set->dsize) { - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set); - ip_set_free(map->extensions); - } - kfree(map); + if (set->dsize && set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set); + ip_set_free(map); set->data = NULL; } @@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; + size_t memsize = sizeof(*map) + map->memsize; nested = ipset_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || - nla_put_net32(skb, IPSET_ATTR_MEMSIZE, - htonl(sizeof(*map) + - map->memsize + - set->dsize * map->elements))) + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 64a564334418..4783efff0bde 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -41,7 +41,6 @@ MODULE_ALIAS("ip_set_bitmap:ip"); /* Type structure */ struct bitmap_ip { void *members; /* the set members */ - void *extensions; /* data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ @@ -49,6 +48,8 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -224,13 +225,6 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], pr_debug("hosts %u, elements %llu\n", hosts, (unsigned long long)elements); - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ip; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { kfree(map); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 1430535118fb..29dde208381d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -47,24 +47,26 @@ enum { /* Type structure */ struct bitmap_ipmac { void *members; /* the set members */ - void *extensions; /* MAC + data extensions */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + unsigned char extensions[0] /* MAC + data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { + unsigned char ether[ETH_ALEN] __aligned(2); u16 id; - unsigned char *ether; + u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; -} __attribute__ ((aligned)); +} __aligned(__alignof__(u64)); static inline u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) @@ -72,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip) return ip - m->first_ip; } -static inline struct bitmap_ipmac_elem * -get_elem(void *extensions, u16 id, size_t dsize) -{ - return (struct bitmap_ipmac_elem *)(extensions + id * dsize); -} +#define get_elem(extensions, id, dsize) \ + (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) + +#define get_const_elem(extensions, id, dsize) \ + (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ @@ -88,10 +90,9 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, if (!test_bit(e->id, map->members)) return 0; - elem = get_elem(map->extensions, e->id, dsize); - if (elem->filled == MAC_FILLED) - return !e->ether || - ether_addr_equal(e->ether, elem->ether); + elem = get_const_elem(map->extensions, e->id, dsize); + if (e->add_mac && elem->filled == MAC_FILLED) + return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } @@ -103,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) if (!test_bit(id, map->members)) return 0; - elem = get_elem(map->extensions, id, dsize); + elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } @@ -133,7 +134,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout, * and we can reuse it later when MAC is filled out, * possibly by the kernel */ - if (e->ether) + if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; @@ -150,7 +151,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, elem = get_elem(map->extensions, e->id, dsize); if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { - if (e->ether && + if (e->add_mac && (flags & IPSET_FLAG_EXIST) && !ether_addr_equal(e->ether, elem->ether)) { /* memcpy isn't atomic */ @@ -159,7 +160,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, ether_addr_copy(elem->ether, e->ether); } return IPSET_ADD_FAILED; - } else if (!e->ether) + } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ @@ -168,7 +169,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; - } else if (e->ether) { + } else if (e->add_mac) { /* We can store MAC too */ ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; @@ -191,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = - get_elem(map->extensions, id, dsize); + get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || @@ -213,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct bitmap_ipmac_adt_elem e = { .id = 0 }; + struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; @@ -231,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; e.id = ip_to_id(map, ip); - e.ether = eth_hdr(skb)->h_source; + memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -265,11 +266,10 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); - if (tb[IPSET_ATTR_ETHER]) - e.ether = nla_data(tb[IPSET_ATTR_ETHER]); - else - e.ether = NULL; - + if (tb[IPSET_ATTR_ETHER]) { + memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); + e.add_mac = 1; + } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; @@ -300,13 +300,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; @@ -361,14 +354,15 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; - map = kzalloc(sizeof(*map), GFP_KERNEL); + set->dsize = ip_set_elem_len(set, tb, + sizeof(struct bitmap_ipmac_elem), + __alignof__(struct bitmap_ipmac_elem)); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = bitmap_bytes(0, elements - 1); set->variant = &bitmap_ipmac; - set->dsize = ip_set_elem_len(set, tb, - sizeof(struct bitmap_ipmac_elem)); if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 5338ccd5da46..7f0c733358a4 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port"); /* Type structure */ struct bitmap_port { void *members; /* the set members */ - void *extensions; /* data extensions */ u16 first_port; /* host byte order, included in range */ u16 last_port; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + unsigned char extensions[0] /* data extensions */ + __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ @@ -209,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->members = ip_set_alloc(map->memsize); if (!map->members) return false; - if (set->dsize) { - map->extensions = ip_set_alloc(set->dsize * map->elements); - if (!map->extensions) { - kfree(map->members); - return false; - } - } map->first_port = first_port; map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; @@ -232,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], { struct bitmap_port *map; u16 first_port, last_port; + u32 elements; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -248,14 +243,15 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], last_port = tmp; } - map = kzalloc(sizeof(*map), GFP_KERNEL); + elements = last_port - first_port + 1; + set->dsize = ip_set_elem_len(set, tb, 0, 0); + map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; - map->elements = last_port - first_port + 1; + map->elements = elements; map->memsize = bitmap_bytes(0, map->elements); set->variant = &bitmap_port; - set->dsize = ip_set_elem_len(set, tb, 0); if (!init_map_port(set, map, first_port, last_port)) { kfree(map); return -ENOMEM; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 69ab9c2634e1..95db43fc0303 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -364,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[]) } size_t -ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len) +ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, + size_t align) { enum ip_set_ext_id id; - size_t offset = len; u32 cadt_flags = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_WITH_FORCEADD) set->flags |= IPSET_CREATE_FLAG_FORCEADD; + if (!align) + align = 1; for (id = 0; id < IPSET_EXT_ID_MAX; id++) { if (!add_extension(id, cadt_flags, tb)) continue; - offset = ALIGN(offset, ip_set_extensions[id].align); - set->offset[id] = offset; + len = ALIGN(len, ip_set_extensions[id].align); + set->offset[id] = len; set->extensions |= ip_set_extensions[id].type; - offset += ip_set_extensions[id].len; + len += ip_set_extensions[id].len; } - return offset; + return ALIGN(len, align); } EXPORT_SYMBOL_GPL(ip_set_elem_len); @@ -823,20 +825,17 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, return 0; } -static int -ip_set_none(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { return -EOPNOTSUPP; } -static int -ip_set_create(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_create(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct net *net = sock_net(ctnl); struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; @@ -974,12 +973,11 @@ ip_set_destroy_set(struct ip_set *set) kfree(set); } -static int -ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_destroy(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *s; ip_set_id_t i; int ret = 0; @@ -1050,12 +1048,11 @@ ip_set_flush_set(struct ip_set *set) spin_unlock_bh(&set->lock); } -static int -ip_set_flush(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *s; ip_set_id_t i; @@ -1090,12 +1087,11 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; -static int -ip_set_rename(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_rename(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; @@ -1140,12 +1136,11 @@ out: * so the ip_set_list always contains valid pointers to the sets. */ -static int -ip_set_swap(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; @@ -1411,10 +1406,9 @@ out: return ret < 0 ? ret : skb->len; } -static int -ip_set_dump(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; @@ -1498,12 +1492,11 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, return ret; } -static int -ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; @@ -1553,12 +1546,11 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb, return ret; } -static int -ip_set_udel(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; const struct nlattr *nla; @@ -1608,12 +1600,11 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb, return ret; } -static int -ip_set_utest(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; @@ -1644,12 +1635,11 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb, /* Get headed data of a set */ -static int -ip_set_header(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_header(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl)); + struct ip_set_net *inst = ip_set_pernet(net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1701,10 +1691,9 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; -static int -ip_set_type(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1760,10 +1749,9 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; -static int -ip_set_protocol(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) +static int ip_set_protocol(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 691b54fcaf2a..e5336ab36d67 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -72,8 +72,9 @@ struct hbucket { DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ - unsigned char value[0]; /* the array of the values */ -} __attribute__ ((aligned)); + unsigned char value[0] /* the array of the values */ + __aligned(__alignof__(u64)); +}; /* The hash table: the table size stored here in order to make resizing easy */ struct htable { @@ -475,7 +476,7 @@ static void mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) { struct htable *t; - struct hbucket *n; + struct hbucket *n, *tmp; struct mtype_elem *data; u32 i, j, d; #ifdef IP_SET_HASH_WITH_NETS @@ -510,9 +511,14 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) } } if (d >= AHASH_INIT_SIZE) { - struct hbucket *tmp = kzalloc(sizeof(*tmp) + - (n->size - AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); + if (d >= n->size) { + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + continue; + } + tmp = kzalloc(sizeof(*tmp) + + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) /* Still try to delete expired elements */ continue; @@ -522,7 +528,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize) continue; data = ahash_data(n, j, dsize); memcpy(tmp->value + d * dsize, data, dsize); - set_bit(j, tmp->used); + set_bit(d, tmp->used); d++; } tmp->pos = d; @@ -1323,12 +1329,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif set->variant = &IPSET_TOKEN(HTYPE, 4_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 4_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem))); #ifndef IP_SET_PROTO_UNDEF } else { set->variant = &IPSET_TOKEN(HTYPE, 6_variant); set->dsize = ip_set_elem_len(set, tb, - sizeof(struct IPSET_TOKEN(HTYPE, 6_elem))); + sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)), + __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem))); } #endif if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 5a30ce6e8c90..bbede95c9f68 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -31,7 +31,7 @@ struct set_elem { struct rcu_head rcu; struct list_head list; ip_set_id_t id; -}; +} __aligned(__alignof__(u64)); struct set_adt_elem { ip_set_id_t id; @@ -618,7 +618,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], size = IP_SET_LIST_MIN_SIZE; set->variant = &set_variant; - set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem)); + set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), + __alignof__(struct set_elem)); if (!init_list_set(net, set, size)) return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 1e24fff53e4b..f57b4dcdb233 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1176,6 +1176,7 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + struct sock *sk; EnterFunction(11); @@ -1183,13 +1184,12 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (skb->ipvs_property) return NF_ACCEPT; + sk = skb_to_full_sk(skb); /* Bad... Do not break raw sockets */ - if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (inet && sk->sk_family == PF_INET && inet->nodefrag) + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) return NF_ACCEPT; } @@ -1681,6 +1681,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int struct ip_vs_conn *cp; int ret, pkts; int conn_reuse_mode; + struct sock *sk; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -1708,12 +1709,11 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int ip_vs_fill_iph_skb(af, skb, false, &iph); /* Bad... Do not break raw sockets */ - if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + sk = skb_to_full_sk(skb); + if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (inet && sk->sk_family == PF_INET && inet->nodefrag) + if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) return NF_ACCEPT; } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 010ddeec135f..d952d67f904d 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -169,7 +169,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, /* Only update csum if we really have to */ if (sctph->dest != cp->dport || payload_csum || (skb->ip_summed == CHECKSUM_PARTIAL && - !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) { + !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) { sctph->dest = cp->dport; sctp_nat_csum(skb, sctph, sctphoff); } else if (skb->ip_summed != CHECKSUM_PARTIAL) { diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index acf5c7b3f378..278927ab0948 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -596,11 +596,18 @@ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; + kuid_t root_uid; + kgid_t root_gid; proc = proc_create("nf_conntrack_expect", 0440, net->proc_net, &exp_file_ops); if (!proc) return -ENOMEM; + + root_uid = make_kuid(net->user_ns, 0); + root_gid = make_kgid(net->user_ns, 0); + if (uid_valid(root_uid) && gid_valid(root_gid)) + proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index b666959f17c0..883c691ec8d0 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -10,6 +10,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> @@ -505,11 +507,11 @@ skip_nl_seq: different IP address. Simply don't record it for NAT. */ if (cmd.l3num == PF_INET) { - pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n", + pr_debug("NOT RECORDING: %pI4 != %pI4\n", &cmd.u3.ip, &ct->tuplehash[dir].tuple.src.u3.ip); } else { - pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n", + pr_debug("NOT RECORDING: %pI6 != %pI6\n", cmd.u3.ip6, ct->tuplehash[dir].tuple.src.u3.ip6); } @@ -586,8 +588,7 @@ static void nf_conntrack_ftp_fini(void) if (ftp[i][j].me == NULL) continue; - pr_debug("nf_ct_ftp: unregistering helper for pf: %d " - "port: %d\n", + pr_debug("unregistering helper for pf: %d port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_helper_unregister(&ftp[i][j]); } @@ -625,14 +626,12 @@ static int __init nf_conntrack_ftp_init(void) else sprintf(ftp[i][j].name, "ftp-%d", ports[i]); - pr_debug("nf_ct_ftp: registering helper for pf: %d " - "port: %d\n", + pr_debug("registering helper for pf: %d port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); ret = nf_conntrack_helper_register(&ftp[i][j]); if (ret) { - printk(KERN_ERR "nf_ct_ftp: failed to register" - " helper for pf: %d port: %d\n", - ftp[i][j].tuple.src.l3num, ports[i]); + pr_err("failed to register helper for pf: %d port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_ftp_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 0fd2976db7ee..8b6da2719600 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/skbuff.h> @@ -237,7 +239,7 @@ static int __init nf_conntrack_irc_init(void) int i, ret; if (max_dcc_channels < 1) { - printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n"); + pr_err("max_dcc_channels must not be zero\n"); return -EINVAL; } @@ -267,8 +269,7 @@ static int __init nf_conntrack_irc_init(void) ret = nf_conntrack_helper_register(&irc[i]); if (ret) { - printk(KERN_ERR "nf_ct_irc: failed to register helper " - "for pf: %u port: %u\n", + pr_err("failed to register helper for pf: %u port: %u\n", irc[i].tuple.src.l3num, ports[i]); nf_conntrack_irc_fini(); return ret; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9f5272968abb..dbb1bb3edb45 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1113,12 +1113,11 @@ static int ctnetlink_flush_conntrack(struct net *net, return 0; } -static int -ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct net *net = sock_net(ctnl); struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conn *ct; @@ -1168,12 +1167,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } -static int -ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct net *net = sock_net(ctnl); struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conn *ct; @@ -1330,10 +1328,10 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) return ctnetlink_dump_list(skb, cb, true); } -static int -ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -1352,10 +1350,10 @@ ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) return ctnetlink_dump_list(skb, cb, false); } -static int -ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -1865,12 +1863,11 @@ err1: return ERR_PTR(err); } -static int -ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct net *net = sock_net(ctnl); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2034,10 +2031,10 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int -ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -2080,10 +2077,9 @@ nlmsg_failure: return -1; } -static int -ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { struct sk_buff *skb2; int err; @@ -2729,12 +2725,12 @@ out: return skb->len; } -static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, +static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]) { int err; - struct net *net = sock_net(ctnl); struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; @@ -2768,12 +2764,10 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, return err; } -static int -ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct net *net = sock_net(ctnl); struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct sk_buff *skb2; @@ -2784,7 +2778,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) - return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda); + return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, @@ -2850,12 +2844,10 @@ out: return err == -EAGAIN ? -ENOBUFS : err; } -static int -ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct net *net = sock_net(ctnl); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -3136,12 +3128,10 @@ err_ct: return err; } -static int -ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - struct net *net = sock_net(ctnl); struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -3242,10 +3232,10 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int -ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 4a2134fd3fcb..7523a575f6d1 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -17,6 +17,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> @@ -120,14 +122,14 @@ static int help(struct sk_buff *skb, ct_sane_info->state = SANE_STATE_NORMAL; if (datalen < sizeof(struct sane_reply_net_start)) { - pr_debug("nf_ct_sane: NET_START reply too short\n"); + pr_debug("NET_START reply too short\n"); goto out; } reply = sb_ptr; if (reply->status != htonl(SANE_STATUS_SUCCESS)) { /* saned refused the command */ - pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n", + pr_debug("unsuccessful SANE_STATUS = %u\n", ntohl(reply->status)); goto out; } @@ -148,7 +150,7 @@ static int help(struct sk_buff *skb, &tuple->src.u3, &tuple->dst.u3, IPPROTO_TCP, NULL, &reply->port); - pr_debug("nf_ct_sane: expect: "); + pr_debug("expect: "); nf_ct_dump_tuple(&exp->tuple); /* Can't expect this? Best to drop packet now. */ @@ -178,8 +180,7 @@ static void nf_conntrack_sane_fini(void) for (i = 0; i < ports_c; i++) { for (j = 0; j < 2; j++) { - pr_debug("nf_ct_sane: unregistering helper for pf: %d " - "port: %d\n", + pr_debug("unregistering helper for pf: %d port: %d\n", sane[i][j].tuple.src.l3num, ports[i]); nf_conntrack_helper_unregister(&sane[i][j]); } @@ -216,14 +217,12 @@ static int __init nf_conntrack_sane_init(void) else sprintf(sane[i][j].name, "sane-%d", ports[i]); - pr_debug("nf_ct_sane: registering helper for pf: %d " - "port: %d\n", + pr_debug("registering helper for pf: %d port: %d\n", sane[i][j].tuple.src.l3num, ports[i]); ret = nf_conntrack_helper_register(&sane[i][j]); if (ret) { - printk(KERN_ERR "nf_ct_sane: failed to " - "register helper for pf: %d port: %d\n", - sane[i][j].tuple.src.l3num, ports[i]); + pr_err("failed to register helper for pf: %d port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); nf_conntrack_sane_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 885b4aba3695..3e06402739e0 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -10,6 +10,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/ctype.h> #include <linux/skbuff.h> @@ -1665,8 +1667,7 @@ static int __init nf_conntrack_sip_init(void) ret = nf_conntrack_helper_register(&sip[i][j]); if (ret) { - printk(KERN_ERR "nf_ct_sip: failed to register" - " helper for pf: %u port: %u\n", + pr_err("failed to register helper for pf: %u port: %u\n", sip[i][j].tuple.src.l3num, ports[i]); nf_conntrack_sip_fini(); return ret; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 1fb3cacc04e1..0f1a45bcacb2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -392,11 +392,18 @@ static const struct file_operations ct_cpu_seq_fops = { static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; + kuid_t root_uid; + kgid_t root_gid; pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops); if (!pde) goto out_nf_conntrack; + root_uid = make_kuid(net->user_ns, 0); + root_gid = make_kgid(net->user_ns, 0); + if (uid_valid(root_uid) && gid_valid(root_gid)) + proc_set_user(pde, root_uid, root_gid); + pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, &ct_cpu_seq_fops); if (!pde) diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index e68ab4fbd71f..36f964066461 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -5,6 +5,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/in.h> @@ -138,9 +140,8 @@ static int __init nf_conntrack_tftp_init(void) ret = nf_conntrack_helper_register(&tftp[i][j]); if (ret) { - printk(KERN_ERR "nf_ct_tftp: failed to register" - " helper for pf: %u port: %u\n", - tftp[i][j].tuple.src.l3num, ports[i]); + pr_err("failed to register helper for pf: %u port: %u\n", + tftp[i][j].tuple.src.l3num, ports[i]); nf_conntrack_tftp_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 93da609d9d29..26e742006c48 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -25,7 +25,7 @@ #include <net/netfilter/nf_conntrack_timeout.h> struct ctnl_timeout * -(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly; +(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c new file mode 100644 index 000000000000..8414ee1a0319 --- /dev/null +++ b/net/netfilter/nf_dup_netdev.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif) +{ + struct net_device *dev; + struct sk_buff *skb; + + dev = dev_get_by_index_rcu(pkt->net, oif); + if (dev == NULL) + return; + + skb = skb_clone(pkt->skb, GFP_ATOMIC); + if (skb == NULL) + return; + + if (skb_mac_header_was_set(skb)) + skb_push(skb, skb->mac_len); + + skb->dev = dev; + skb_sender_cpu_clear(skb); + dev_queue_xmit(skb); +} +EXPORT_SYMBOL_GPL(nf_dup_netdev_egress); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 93cc4737018f..2011977cd79d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -41,6 +41,8 @@ int nft_register_afinfo(struct net *net, struct nft_af_info *afi) } EXPORT_SYMBOL_GPL(nft_register_afinfo); +static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi); + /** * nft_unregister_afinfo - unregister nf_tables address family info * @@ -48,9 +50,10 @@ EXPORT_SYMBOL_GPL(nft_register_afinfo); * * Unregister the address family for use with nf_tables. */ -void nft_unregister_afinfo(struct nft_af_info *afi) +void nft_unregister_afinfo(struct net *net, struct nft_af_info *afi) { nfnl_lock(NFNL_SUBSYS_NFTABLES); + __nft_release_afinfo(net, afi); list_del_rcu(&afi->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } @@ -89,6 +92,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) } static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, struct nft_af_info *afi, @@ -96,7 +100,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, struct nft_chain *chain, const struct nlattr * const *nla) { - ctx->net = sock_net(skb->sk); + ctx->net = net; ctx->afi = afi; ctx->table = table; ctx->chain = chain; @@ -127,8 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -int nft_register_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) +static int nft_register_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) { struct net *net = read_pnet(&basechain->pnet); @@ -137,10 +141,9 @@ int nft_register_basechain(struct nft_base_chain *basechain, return nf_register_net_hooks(net, basechain->ops, hook_nops); } -EXPORT_SYMBOL_GPL(nft_register_basechain); -void nft_unregister_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops) +static void nft_unregister_basechain(struct nft_base_chain *basechain, + unsigned int hook_nops) { struct net *net = read_pnet(&basechain->pnet); @@ -149,7 +152,6 @@ void nft_unregister_basechain(struct nft_base_chain *basechain, nf_unregister_net_hooks(net, basechain->ops, hook_nops); } -EXPORT_SYMBOL_GPL(nft_unregister_basechain); static int nf_tables_register_hooks(const struct nft_table *table, struct nft_chain *chain, @@ -541,15 +543,14 @@ done: return skb->len; } -static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_gettable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_af_info *afi; const struct nft_table *table; struct sk_buff *skb2; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; int err; @@ -672,15 +673,14 @@ err: return ret; } -static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; @@ -706,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -730,7 +730,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&table->sets); table->flags = flags; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err3; @@ -810,18 +810,17 @@ out: return err; } -static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_deltable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) return nft_flush(&ctx, family); @@ -832,8 +831,6 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; ctx.afi = afi; ctx.table = table; @@ -1099,8 +1096,8 @@ done: return skb->len; } -static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_getchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1108,7 +1105,6 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, const struct nft_table *table; const struct nft_chain *chain; struct sk_buff *skb2; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; int err; @@ -1221,8 +1217,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) } } -static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1232,7 +1228,6 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct net_device *dev = NULL; u8 policy = NF_ACCEPT; @@ -1313,7 +1308,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(stats); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); if (trans == NULL) { @@ -1461,7 +1456,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (err < 0) goto err1; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1476,15 +1471,14 @@ err1: return err; } -static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; @@ -1495,18 +1489,14 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_CHAIN_INACTIVE) - return -ENOENT; if (chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); return nft_delchain(&ctx); } @@ -1931,8 +1921,8 @@ done: return skb->len; } -static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_getrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1941,7 +1931,6 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, const struct nft_chain *chain; const struct nft_rule *rule; struct sk_buff *skb2; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; int err; @@ -2010,13 +1999,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static struct nft_expr_info *info; -static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2075,7 +2063,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); n = 0; size = 0; @@ -2176,13 +2164,12 @@ err1: return err; } -static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; @@ -2196,8 +2183,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; if (nla[NFTA_RULE_CHAIN]) { chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]); @@ -2205,7 +2190,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(chain); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2338,18 +2323,19 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_ID] = { .type = NLA_U32 }, [NFTA_SET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, + [NFTA_SET_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; struct nft_table *table = NULL; @@ -2367,11 +2353,9 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); - if (table->flags & NFT_TABLE_INACTIVE) - return -ENOENT; } - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -2500,6 +2484,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, goto nla_put_failure; } + if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) + goto nla_put_failure; + desc = nla_nest_start(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; @@ -2619,8 +2606,8 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) return 0; } -static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_getset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nft_set *set; @@ -2630,7 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2693,14 +2680,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -2710,6 +2696,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, u64 timeout; u32 ktype, dtype, flags, policy, gc_int; struct nft_set_desc desc; + unsigned char *udata; + u16 udlen; int err; if (nla[NFTA_SET_TABLE] == NULL || @@ -2798,7 +2786,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) { @@ -2822,12 +2810,16 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(ops)) return PTR_ERR(ops); + udlen = 0; + if (nla[NFTA_SET_USERDATA]) + udlen = nla_len(nla[NFTA_SET_USERDATA]); + size = 0; if (ops->privsize != NULL) size = ops->privsize(nla); err = -ENOMEM; - set = kzalloc(sizeof(*set) + size, GFP_KERNEL); + set = kzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); if (set == NULL) goto err1; @@ -2836,6 +2828,12 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (err < 0) goto err2; + udata = NULL; + if (udlen) { + udata = set->data + size; + nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen); + } + INIT_LIST_HEAD(&set->bindings); write_pnet(&set->pnet, net); set->ops = ops; @@ -2846,6 +2844,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, set->flags = flags; set->size = desc.size; set->policy = policy; + set->udlen = udlen; + set->udata = udata; set->timeout = timeout; set->gc_int = gc_int; @@ -2882,8 +2882,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set nft_set_destroy(set); } -static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2896,15 +2896,13 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) return PTR_ERR(set); - if (set->flags & NFT_SET_INACTIVE) - return -ENOENT; if (!list_empty(&set->bindings)) return -EBUSY; @@ -3024,16 +3022,14 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - bool trans) + const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); if (IS_ERR(afi)) @@ -3042,10 +3038,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]); if (IS_ERR(table)) return PTR_ERR(table); - if (!trans && (table->flags & NFT_TABLE_INACTIVE)) - return -ENOENT; - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -3135,6 +3129,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3150,10 +3145,12 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, - false); + err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, + (void *)nla); if (err < 0) return err; + if (ctx.table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) @@ -3208,17 +3205,19 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; + if (ctx.table->flags & NFT_TABLE_INACTIVE) + return -ENOENT; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]); if (IS_ERR(set)) @@ -3528,11 +3527,10 @@ err1: return err; } -static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3541,7 +3539,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -3623,8 +3621,8 @@ err1: return err; } -static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nlattr *attr; @@ -3635,7 +3633,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -3739,11 +3737,10 @@ err: return err; } -static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_getgen(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); struct sk_buff *skb2; int err; @@ -3887,9 +3884,8 @@ static void nf_tables_commit_release(struct nft_trans *trans) kfree(trans); } -static int nf_tables_commit(struct sk_buff *skb) +static int nf_tables_commit(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -4024,13 +4020,13 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int nf_tables_abort(struct sk_buff *skb) +static int nf_tables_abort(struct net *net, struct sk_buff *skb) { - struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; struct nft_trans_elem *te; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { @@ -4446,22 +4442,22 @@ static void nft_verdict_uninit(const struct nft_data *data) } } -static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data) +int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v) { struct nlattr *nest; - nest = nla_nest_start(skb, NFTA_DATA_VERDICT); + nest = nla_nest_start(skb, type); if (!nest) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict.code))) + if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(v->code))) goto nla_put_failure; - switch (data->verdict.code) { + switch (v->code) { case NFT_JUMP: case NFT_GOTO: if (nla_put_string(skb, NFTA_VERDICT_CHAIN, - data->verdict.chain->name)) + v->chain->name)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -4572,7 +4568,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, err = nft_value_dump(skb, data, len); break; case NFT_DATA_VERDICT: - err = nft_verdict_dump(skb, data); + err = nft_verdict_dump(skb, NFTA_DATA_VERDICT, &data->verdict); break; default: err = -EINVAL; @@ -4584,7 +4580,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -static int nf_tables_init_net(struct net *net) +static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.af_info); INIT_LIST_HEAD(&net->nft.commit_list); @@ -4592,6 +4588,67 @@ static int nf_tables_init_net(struct net *net) return 0; } +int __nft_release_basechain(struct nft_ctx *ctx) +{ + struct nft_rule *rule, *nr; + + BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN)); + + nf_tables_unregister_hooks(ctx->chain->table, ctx->chain, + ctx->afi->nops); + list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { + list_del(&rule->list); + ctx->chain->use--; + nf_tables_rule_destroy(ctx, rule); + } + list_del(&ctx->chain->list); + ctx->table->use--; + nf_tables_chain_destroy(ctx->chain); + + return 0; +} +EXPORT_SYMBOL_GPL(__nft_release_basechain); + +/* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */ +static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) +{ + struct nft_table *table, *nt; + struct nft_chain *chain, *nc; + struct nft_rule *rule, *nr; + struct nft_set *set, *ns; + struct nft_ctx ctx = { + .net = net, + .afi = afi, + }; + + list_for_each_entry_safe(table, nt, &afi->tables, list) { + list_for_each_entry(chain, &table->chains, list) + nf_tables_unregister_hooks(table, chain, afi->nops); + /* No packets are walking on these chains anymore. */ + ctx.table = table; + list_for_each_entry(chain, &table->chains, list) { + ctx.chain = chain; + list_for_each_entry_safe(rule, nr, &chain->rules, list) { + list_del(&rule->list); + chain->use--; + nf_tables_rule_destroy(&ctx, rule); + } + } + list_for_each_entry_safe(set, ns, &table->sets, list) { + list_del(&set->list); + table->use--; + nft_set_destroy(set); + } + list_for_each_entry_safe(chain, nc, &table->chains, list) { + list_del(&chain->list); + table->use--; + nf_tables_chain_destroy(chain); + } + list_del(&table->list); + nf_tables_table_destroy(&ctx); + } +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, }; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index f3695a497408..e9f8dffcc244 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -16,22 +16,17 @@ #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/netfilter.h> +#include <linux/static_key.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> -enum nft_trace { - NFT_TRACE_RULE, - NFT_TRACE_RETURN, - NFT_TRACE_POLICY, -}; - -static const char *const comments[] = { - [NFT_TRACE_RULE] = "rule", - [NFT_TRACE_RETURN] = "return", - [NFT_TRACE_POLICY] = "policy", +static const char *const comments[__NFT_TRACETYPE_MAX] = { + [NFT_TRACETYPE_POLICY] = "policy", + [NFT_TRACETYPE_RETURN] = "return", + [NFT_TRACETYPE_RULE] = "rule", }; static struct nf_loginfo trace_loginfo = { @@ -44,22 +39,36 @@ static struct nf_loginfo trace_loginfo = { }, }; -static void __nft_trace_packet(const struct nft_pktinfo *pkt, - const struct nft_chain *chain, - int rulenum, enum nft_trace type) +static noinline void __nft_trace_packet(struct nft_traceinfo *info, + const struct nft_chain *chain, + int rulenum, enum nft_trace_types type) { + const struct nft_pktinfo *pkt = info->pkt; + + if (!info->trace || !pkt->skb->nf_trace) + return; + + info->chain = chain; + info->type = type; + + nft_trace_notify(info); + nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in, pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", chain->table->name, chain->name, comments[type], rulenum); } -static inline void nft_trace_packet(const struct nft_pktinfo *pkt, +static inline void nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, - int rulenum, enum nft_trace type) + const struct nft_rule *rule, + int rulenum, + enum nft_trace_types type) { - if (unlikely(pkt->skb->nf_trace)) - __nft_trace_packet(pkt, chain, rulenum, type); + if (static_branch_unlikely(&nft_trace_enabled)) { + info->rule = rule; + __nft_trace_packet(info, chain, rulenum, type); + } } static void nft_cmp_fast_eval(const struct nft_expr *expr, @@ -121,7 +130,11 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) struct nft_stats *stats; int rulenum; unsigned int gencursor = nft_genmask_cur(net); + struct nft_traceinfo info; + info.trace = false; + if (static_branch_unlikely(&nft_trace_enabled)) + nft_trace_init(&info, pkt, ®s.verdict, basechain); do_chain: rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); @@ -151,7 +164,8 @@ next_rule: regs.verdict.code = NFT_CONTINUE; continue; case NFT_CONTINUE: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RULE); continue; } break; @@ -161,7 +175,8 @@ next_rule: case NF_ACCEPT: case NF_DROP: case NF_QUEUE: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RULE); return regs.verdict.code; } @@ -174,7 +189,8 @@ next_rule: stackptr++; /* fall through */ case NFT_GOTO: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RULE); chain = regs.verdict.chain; goto do_chain; @@ -182,7 +198,8 @@ next_rule: rulenum++; /* fall through */ case NFT_RETURN: - nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN); + nft_trace_packet(&info, chain, rule, + rulenum, NFT_TRACETYPE_RETURN); break; default: WARN_ON(1); @@ -196,7 +213,8 @@ next_rule: goto next_rule; } - nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY); + nft_trace_packet(&info, basechain, NULL, -1, + NFT_TRACETYPE_POLICY); rcu_read_lock_bh(); stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats)); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index 9dd2d216cfc1..6b5f76295d3d 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -57,7 +57,7 @@ err: static void __net_exit nf_tables_inet_exit_net(struct net *net) { - nft_unregister_afinfo(net->nft.inet); + nft_unregister_afinfo(net, net->nft.inet); kfree(net->nft.inet); } diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 7b9c053ba750..b6605e000801 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -94,7 +94,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, { struct nft_pktinfo pkt; - switch (eth_hdr(skb)->h_proto) { + switch (skb->protocol) { case htons(ETH_P_IP): nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); break; @@ -139,7 +139,7 @@ err: static void nf_tables_netdev_exit_net(struct net *net) { - nft_unregister_afinfo(net->nft.netdev); + nft_unregister_afinfo(net, net->nft.netdev); kfree(net->nft.netdev); } @@ -156,35 +156,17 @@ static const struct nf_chain_type nft_filter_chain_netdev = { .hook_mask = (1 << NF_NETDEV_INGRESS), }; -static void nft_netdev_event(unsigned long event, struct nft_af_info *afi, - struct net_device *dev, struct nft_table *table, - struct nft_base_chain *basechain) +static void nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_ctx *ctx) { - switch (event) { - case NETDEV_REGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; + struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED)); - - dev_hold(dev); - basechain->ops[0].dev = dev; - basechain->flags &= ~NFT_BASECHAIN_DISABLED; - if (!(table->flags & NFT_TABLE_F_DORMANT)) - nft_register_basechain(basechain, afi->nops); - break; + switch (event) { case NETDEV_UNREGISTER: if (strcmp(basechain->dev_name, dev->name) != 0) return; - BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED); - - if (!(table->flags & NFT_TABLE_F_DORMANT)) - nft_unregister_basechain(basechain, afi->nops); - - dev_put(basechain->ops[0].dev); - basechain->ops[0].dev = NULL; - basechain->flags |= NFT_BASECHAIN_DISABLED; + __nft_release_basechain(ctx); break; case NETDEV_CHANGENAME: if (dev->ifindex != basechain->ops[0].dev->ifindex) @@ -201,20 +183,29 @@ static int nf_tables_netdev_event(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_af_info *afi; struct nft_table *table; - struct nft_chain *chain; + struct nft_chain *chain, *nr; + struct nft_ctx ctx = { + .net = dev_net(dev), + }; + + if (event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; nfnl_lock(NFNL_SUBSYS_NFTABLES); list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) { + ctx.afi = afi; if (afi->family != NFPROTO_NETDEV) continue; list_for_each_entry(table, &afi->tables, list) { - list_for_each_entry(chain, &table->chains, list) { + ctx.table = table; + list_for_each_entry_safe(chain, nr, &table->chains, list) { if (!(chain->flags & NFT_BASE_CHAIN)) continue; - nft_netdev_event(event, afi, dev, table, - nft_base_chain(chain)); + ctx.chain = chain; + nft_netdev_event(event, dev, &ctx); } } } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c new file mode 100644 index 000000000000..e9e959f65d91 --- /dev/null +++ b/net/netfilter/nf_tables_trace.c @@ -0,0 +1,275 @@ +/* + * (C) 2015 Red Hat GmbH + * Author: Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/static_key.h> +#include <linux/hash.h> +#include <linux/jhash.h> +#include <linux/if_vlan.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +#define NFT_TRACETYPE_LL_HSIZE 20 +#define NFT_TRACETYPE_NETWORK_HSIZE 40 +#define NFT_TRACETYPE_TRANSPORT_HSIZE 20 + +DEFINE_STATIC_KEY_FALSE(nft_trace_enabled); +EXPORT_SYMBOL_GPL(nft_trace_enabled); + +static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb) +{ + __be32 id; + + /* using skb address as ID results in a limited number of + * values (and quick reuse). + * + * So we attempt to use as many skb members that will not + * change while skb is with netfilter. + */ + id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb), + skb->skb_iif); + + return nla_put_be32(nlskb, NFTA_TRACE_ID, id); +} + +static int trace_fill_header(struct sk_buff *nlskb, u16 type, + const struct sk_buff *skb, + int off, unsigned int len) +{ + struct nlattr *nla; + + if (len == 0) + return 0; + + nla = nla_reserve(nlskb, type, len); + if (!nla || skb_copy_bits(skb, off, nla_data(nla), len)) + return -1; + + return 0; +} + +static int nf_trace_fill_ll_header(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + struct vlan_ethhdr veth; + int off; + + BUILD_BUG_ON(sizeof(veth) > NFT_TRACETYPE_LL_HSIZE); + + off = skb_mac_header(skb) - skb->data; + if (off != -ETH_HLEN) + return -1; + + if (skb_copy_bits(skb, off, &veth, ETH_HLEN)) + return -1; + + veth.h_vlan_proto = skb->vlan_proto; + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth.h_vlan_encapsulated_proto = skb->protocol; + + return nla_put(nlskb, NFTA_TRACE_LL_HEADER, sizeof(veth), &veth); +} + +static int nf_trace_fill_dev_info(struct sk_buff *nlskb, + const struct net_device *indev, + const struct net_device *outdev) +{ + if (indev) { + if (nla_put_be32(nlskb, NFTA_TRACE_IIF, + htonl(indev->ifindex))) + return -1; + + if (nla_put_be16(nlskb, NFTA_TRACE_IIFTYPE, + htons(indev->type))) + return -1; + } + + if (outdev) { + if (nla_put_be32(nlskb, NFTA_TRACE_OIF, + htonl(outdev->ifindex))) + return -1; + + if (nla_put_be16(nlskb, NFTA_TRACE_OIFTYPE, + htons(outdev->type))) + return -1; + } + + return 0; +} + +static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, + const struct nft_pktinfo *pkt) +{ + const struct sk_buff *skb = pkt->skb; + unsigned int len = min_t(unsigned int, + pkt->xt.thoff - skb_network_offset(skb), + NFT_TRACETYPE_NETWORK_HSIZE); + int off = skb_network_offset(skb); + + if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) + return -1; + + len = min_t(unsigned int, skb->len - pkt->xt.thoff, + NFT_TRACETYPE_TRANSPORT_HSIZE); + + if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, + pkt->xt.thoff, len)) + return -1; + + if (!skb_mac_header_was_set(skb)) + return 0; + + if (skb_vlan_tag_get(skb)) + return nf_trace_fill_ll_header(nlskb, skb); + + off = skb_mac_header(skb) - skb->data; + len = min_t(unsigned int, -off, NFT_TRACETYPE_LL_HSIZE); + return trace_fill_header(nlskb, NFTA_TRACE_LL_HEADER, + skb, off, len); +} + +static int nf_trace_fill_rule_info(struct sk_buff *nlskb, + const struct nft_traceinfo *info) +{ + if (!info->rule) + return 0; + + /* a continue verdict with ->type == RETURN means that this is + * an implicit return (end of chain reached). + * + * Since no rule matched, the ->rule pointer is invalid. + */ + if (info->type == NFT_TRACETYPE_RETURN && + info->verdict->code == NFT_CONTINUE) + return 0; + + return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE, + cpu_to_be64(info->rule->handle)); +} + +void nft_trace_notify(struct nft_traceinfo *info) +{ + const struct nft_pktinfo *pkt = info->pkt; + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + struct sk_buff *skb; + unsigned int size; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE; + + if (!nfnetlink_has_listeners(pkt->net, NFNLGRP_NFTRACE)) + return; + + size = nlmsg_total_size(sizeof(struct nfgenmsg)) + + nla_total_size(NFT_TABLE_MAXNAMELEN) + + nla_total_size(NFT_CHAIN_MAXNAMELEN) + + nla_total_size(sizeof(__be64)) + /* rule handle */ + nla_total_size(sizeof(__be32)) + /* trace type */ + nla_total_size(0) + /* VERDICT, nested */ + nla_total_size(sizeof(u32)) + /* verdict code */ + nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */ + nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(NFT_TRACETYPE_LL_HSIZE) + + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + + nla_total_size(sizeof(u32)) + /* iif */ + nla_total_size(sizeof(__be16)) + /* iiftype */ + nla_total_size(sizeof(u32)) + /* oif */ + nla_total_size(sizeof(__be16)) + /* oiftype */ + nla_total_size(sizeof(u32)) + /* mark */ + nla_total_size(sizeof(u32)) + /* nfproto */ + nla_total_size(sizeof(u32)); /* policy */ + + skb = nlmsg_new(size, GFP_ATOMIC); + if (!skb) + return; + + nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); + if (!nlh) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = info->basechain->type->family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(pkt->pf))) + goto nla_put_failure; + + if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type))) + goto nla_put_failure; + + if (trace_fill_id(skb, pkt->skb)) + goto nla_put_failure; + + if (info->chain) { + if (nla_put_string(skb, NFTA_TRACE_CHAIN, + info->chain->name)) + goto nla_put_failure; + if (nla_put_string(skb, NFTA_TRACE_TABLE, + info->chain->table->name)) + goto nla_put_failure; + } + + if (nf_trace_fill_rule_info(skb, info)) + goto nla_put_failure; + + switch (info->type) { + case NFT_TRACETYPE_UNSPEC: + case __NFT_TRACETYPE_MAX: + break; + case NFT_TRACETYPE_RETURN: + case NFT_TRACETYPE_RULE: + if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) + goto nla_put_failure; + break; + case NFT_TRACETYPE_POLICY: + if (nla_put_be32(skb, NFTA_TRACE_POLICY, + info->basechain->policy)) + goto nla_put_failure; + break; + } + + if (pkt->skb->mark && + nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark))) + goto nla_put_failure; + + if (!info->packet_dumped) { + if (nf_trace_fill_dev_info(skb, pkt->in, pkt->out)) + goto nla_put_failure; + + if (nf_trace_fill_pkt_info(skb, pkt)) + goto nla_put_failure; + info->packet_dumped = true; + } + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, pkt->net, 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC); + return; + + nla_put_failure: + WARN_ON_ONCE(1); + kfree_skb(skb); +} + +void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_chain *chain) +{ + info->basechain = nft_base_chain(chain); + info->trace = true; + info->packet_dumped = false; + info->pkt = pkt; + info->verdict = verdict; +} diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 46453ab318db..a7ba23353dab 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -33,6 +33,10 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); +#define nfnl_dereference_protected(id) \ + rcu_dereference_protected(table[(id)].subsys, \ + lockdep_nfnl_is_held((id))) + static char __initdata nfversion[] = "0.30"; static struct { @@ -49,6 +53,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, + [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, }; void nfnl_lock(__u8 subsys_id) @@ -201,19 +206,18 @@ replay: } if (nc->call_rcu) { - err = nc->call_rcu(net->nfnl, skb, nlh, + err = nc->call_rcu(net, net->nfnl, skb, nlh, (const struct nlattr **)cda); rcu_read_unlock(); } else { rcu_read_unlock(); nfnl_lock(subsys_id); - if (rcu_dereference_protected(table[subsys_id].subsys, - lockdep_is_held(&table[subsys_id].mutex)) != ss || + if (nfnl_dereference_protected(subsys_id) != ss || nfnetlink_find_client(type, ss) != nc) err = -EAGAIN; else if (nc->call) - err = nc->call(net->nfnl, skb, nlh, - (const struct nlattr **)cda); + err = nc->call(net, net->nfnl, skb, nlh, + (const struct nlattr **)cda); else err = -EINVAL; nfnl_unlock(subsys_id); @@ -295,18 +299,14 @@ replay: if (!skb) return netlink_ack(oskb, nlh, -ENOMEM); - skb->sk = oskb->sk; - nfnl_lock(subsys_id); - ss = rcu_dereference_protected(table[subsys_id].subsys, - lockdep_is_held(&table[subsys_id].mutex)); + ss = nfnl_dereference_protected(subsys_id); if (!ss) { #ifdef CONFIG_MODULES nfnl_unlock(subsys_id); request_module("nfnetlink-subsys-%d", subsys_id); nfnl_lock(subsys_id); - ss = rcu_dereference_protected(table[subsys_id].subsys, - lockdep_is_held(&table[subsys_id].mutex)); + ss = nfnl_dereference_protected(subsys_id); if (!ss) #endif { @@ -381,7 +381,7 @@ replay: goto ack; if (nc->call_batch) { - err = nc->call_batch(net->nfnl, skb, nlh, + err = nc->call_batch(net, net->nfnl, skb, nlh, (const struct nlattr **)cda); } @@ -425,15 +425,15 @@ next: } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(oskb); + ss->abort(net, oskb); nfnl_err_reset(&err_list); nfnl_unlock(subsys_id); kfree_skb(skb); goto replay; } else if (status == NFNL_BATCH_DONE) { - ss->commit(oskb); + ss->commit(net, oskb); } else { - ss->abort(oskb); + ss->abort(net, oskb); } nfnl_err_deliver(&err_list, oskb); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index fefbf5f0b28d..5274b04c42a6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -46,12 +46,11 @@ struct nfacct_filter { #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ -static int -nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_acct_new(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; - struct net *net = sock_net(nfnl); char *acct_name; unsigned int size = 0; u32 flags = 0; @@ -253,11 +252,10 @@ nfacct_filter_alloc(const struct nlattr * const attr) return filter; } -static int -nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_acct_get(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { - struct net *net = sock_net(nfnl); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -333,11 +331,10 @@ static int nfnl_acct_try_del(struct nf_acct *cur) return ret; } -static int -nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_acct_del(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { - struct net *net = sock_net(nfnl); char *acct_name; struct nf_acct *cur; int ret = -ENOENT; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 54330fb5efaf..e924e95fcc7f 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -286,9 +286,9 @@ nfnl_cthelper_update(const struct nlattr * const tb[], return 0; } -static int -nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; @@ -498,9 +498,9 @@ out: return skb->len; } -static int -nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { int ret = -ENOENT, i; struct nf_conntrack_helper *cur; @@ -570,9 +570,9 @@ nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, return ret; } -static int -nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index c7a2d0e1c462..5d010f27ac01 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -38,8 +38,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); -static LIST_HEAD(cttimeout_list); - static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, .len = CTNL_TIMEOUT_NAME_MAX - 1}, @@ -67,16 +65,15 @@ ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, return ret; } -static int -cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { __u16 l3num; __u8 l4num; struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; - struct net *net = sock_net(skb->sk); char *name; int ret; @@ -90,7 +87,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - list_for_each_entry(timeout, &cttimeout_list, head) { + list_for_each_entry(timeout, &net->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; @@ -145,7 +142,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb, timeout->l3num = l3num; timeout->l4proto = l4proto; atomic_set(&timeout->refcnt, 1); - list_add_tail_rcu(&timeout->head, &cttimeout_list); + list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); return 0; err: @@ -209,6 +206,7 @@ nla_put_failure: static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; if (cb->args[2]) @@ -219,7 +217,7 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &cttimeout_list, head) { + list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) { if (last) { if (cur != last) continue; @@ -240,10 +238,10 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int -cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { int ret = -ENOENT; char *name; @@ -260,7 +258,7 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); - list_for_each_entry(cur, &cttimeout_list, head) { + list_for_each_entry(cur, &net->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) @@ -301,17 +299,17 @@ static void untimeout(struct nf_conntrack_tuple_hash *i, RCU_INIT_POINTER(timeout_ext->timeout, NULL); } -static void ctnl_untimeout(struct ctnl_timeout *timeout) +static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) { struct nf_conntrack_tuple_hash *h; const struct hlist_nulls_node *nn; int i; local_bh_disable(); - for (i = 0; i < init_net.ct.htable_size; i++) { + for (i = 0; i < net->ct.htable_size; i++) { spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < init_net.ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &init_net.ct.hash[i], hnnode) + if (i < net->ct.htable_size) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) untimeout(h, timeout); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); @@ -320,7 +318,7 @@ static void ctnl_untimeout(struct ctnl_timeout *timeout) } /* try to delete object, fail if it is still in use. */ -static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) +static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { int ret = 0; @@ -329,7 +327,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); - ctnl_untimeout(timeout); + ctnl_untimeout(net, timeout); kfree_rcu(timeout, rcu_head); } else { /* still in use, restore reference counter. */ @@ -339,28 +337,28 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) return ret; } -static int -cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { - char *name; struct ctnl_timeout *cur; int ret = -ENOENT; + char *name; if (!cda[CTA_TIMEOUT_NAME]) { - list_for_each_entry(cur, &cttimeout_list, head) - ctnl_timeout_try_del(cur); + list_for_each_entry(cur, &net->nfct_timeout_list, head) + ctnl_timeout_try_del(net, cur); return 0; } name = nla_data(cda[CTA_TIMEOUT_NAME]); - list_for_each_entry(cur, &cttimeout_list, head) { + list_for_each_entry(cur, &net->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - ret = ctnl_timeout_try_del(cur); + ret = ctnl_timeout_try_del(net, cur); if (ret < 0) return ret; @@ -369,15 +367,14 @@ cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb, return ret; } -static int -cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) +static int cttimeout_default_set(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) { __u16 l3num; __u8 l4num; struct nf_conntrack_l4proto *l4proto; - struct net *net = sock_net(skb->sk); unsigned int *timeouts; int ret; @@ -459,14 +456,14 @@ nla_put_failure: return -1; } -static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb, +static int cttimeout_default_get(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]) { __u16 l3num; __u8 l4num; struct nf_conntrack_l4proto *l4proto; - struct net *net = sock_net(skb->sk); struct sk_buff *skb2; int ret, err; @@ -511,12 +508,13 @@ err: } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -static struct ctnl_timeout *ctnl_timeout_find_get(const char *name) +static struct ctnl_timeout * +ctnl_timeout_find_get(struct net *net, const char *name) { struct ctnl_timeout *timeout, *matching = NULL; rcu_read_lock(); - list_for_each_entry_rcu(timeout, &cttimeout_list, head) { + list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; @@ -569,10 +567,39 @@ static const struct nfnetlink_subsystem cttimeout_subsys = { MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); +static int __net_init cttimeout_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->nfct_timeout_list); + + return 0; +} + +static void __net_exit cttimeout_net_exit(struct net *net) +{ + struct ctnl_timeout *cur, *tmp; + + ctnl_untimeout(net, NULL); + + list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { + list_del_rcu(&cur->head); + nf_ct_l4proto_put(cur->l4proto); + kfree_rcu(cur, rcu_head); + } +} + +static struct pernet_operations cttimeout_ops = { + .init = cttimeout_net_init, + .exit = cttimeout_net_exit, +}; + static int __init cttimeout_init(void) { int ret; + ret = register_pernet_subsys(&cttimeout_ops); + if (ret < 0) + return ret; + ret = nfnetlink_subsys_register(&cttimeout_subsys); if (ret < 0) { pr_err("cttimeout_init: cannot register cttimeout with " @@ -586,28 +613,17 @@ static int __init cttimeout_init(void) return 0; err_out: + unregister_pernet_subsys(&cttimeout_ops); return ret; } static void __exit cttimeout_exit(void) { - struct ctnl_timeout *cur, *tmp; - pr_info("cttimeout: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&cttimeout_subsys); - /* Make sure no conntrack objects refer to custom timeouts anymore. */ - ctnl_untimeout(NULL); - - list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) { - list_del_rcu(&cur->head); - /* We are sure that our objects have no clients at this point, - * it's safe to release them all without checking refcnt. - */ - nf_ct_l4proto_put(cur->l4proto); - kfree_rcu(cur, rcu_head); - } + unregister_pernet_subsys(&cttimeout_ops); #ifdef CONFIG_NF_CONNTRACK_TIMEOUT RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 06eb48fceb42..8ca932057c13 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -293,24 +293,20 @@ nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) return status; } -static int +static void nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) { spin_lock_bh(&inst->lock); inst->flushtimeout = timeout; spin_unlock_bh(&inst->lock); - - return 0; } -static int +static void nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) { spin_lock_bh(&inst->lock); inst->qthreshold = qthresh; spin_unlock_bh(&inst->lock); - - return 0; } static int @@ -789,10 +785,9 @@ static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; -static int -nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) +static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) { return -ENOTSUPP; } @@ -813,19 +808,17 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; -static int -nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfula[]) +static int nfulnl_recv_config(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nfula[]) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t group_num = ntohs(nfmsg->res_id); struct nfulnl_instance *inst; struct nfulnl_msg_config_cmd *cmd = NULL; - struct net *net = sock_net(ctnl); struct nfnl_log_net *log = nfnl_log_pernet(net); int ret = 0; - u16 flags; + u16 flags = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = nfmsg->nfgen_family; @@ -895,7 +888,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto out_put; default: ret = -ENOTSUPP; - break; + goto out_put; } } else if (!inst) { ret = -ENODEV; @@ -1064,15 +1057,26 @@ static int __net_init nfnl_log_net_init(struct net *net) { unsigned int i; struct nfnl_log_net *log = nfnl_log_pernet(net); +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; + kuid_t root_uid; + kgid_t root_gid; +#endif for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&log->instance_table[i]); spin_lock_init(&log->instances_lock); #ifdef CONFIG_PROC_FS - if (!proc_create("nfnetlink_log", 0440, - net->nf.proc_netfilter, &nful_file_ops)) + proc = proc_create("nfnetlink_log", 0440, + net->nf.proc_netfilter, &nful_file_ops); + if (!proc) return -ENOMEM; + + root_uid = make_kuid(net->user_ns, 0); + root_gid = make_kgid(net->user_ns, 0); + if (uid_valid(root_uid) && gid_valid(root_gid)) + proc_set_user(proc, root_uid, root_gid); #endif return 0; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 7d81d280cb4f..1d3936587ace 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -365,8 +365,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, break; } + nfnl_ct = rcu_dereference(nfnl_ct_hook); + if (queue->flags & NFQA_CFG_F_CONNTRACK) { - nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) { ct = nfnl_ct->get_ct(entskb, &ctinfo); if (ct != NULL) @@ -956,10 +957,10 @@ static int nfq_id_after(unsigned int id, unsigned int max) return (int)(id - max) > 0; } -static int -nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) +static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_queue_entry *entry, *tmp; @@ -968,8 +969,6 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, struct nfqnl_instance *queue; LIST_HEAD(batch_list); u16 queue_num = ntohs(nfmsg->res_id); - - struct net *net = sock_net(ctnl); struct nfnl_queue_net *q = nfnl_queue_pernet(net); queue = verdict_instance_lookup(q, queue_num, @@ -1028,14 +1027,13 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, return ct; } -static int -nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) +static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict; @@ -1043,8 +1041,6 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, enum ip_conntrack_info uninitialized_var(ctinfo); struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; - - struct net *net = sock_net(ctnl); struct nfnl_queue_net *q = nfnl_queue_pernet(net); queue = instance_lookup(q, queue_num); @@ -1064,9 +1060,10 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if (entry == NULL) return -ENOENT; + /* rcu lock already held from nfnl->call_rcu. */ + nfnl_ct = rcu_dereference(nfnl_ct_hook); + if (nfqa[NFQA_CT]) { - /* rcu lock already held from nfnl->call_rcu. */ - nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfnl_ct != NULL) ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); } @@ -1090,10 +1087,9 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, return 0; } -static int -nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) +static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) { return -ENOTSUPP; } @@ -1108,17 +1104,16 @@ static const struct nf_queue_handler nfqh = { .nf_hook_drop = &nfqnl_nf_hook_drop, }; -static int -nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) +static int nfqnl_recv_config(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_instance *queue; struct nfqnl_msg_config_cmd *cmd = NULL; - struct net *net = sock_net(ctnl); struct nfnl_queue_net *q = nfnl_queue_pernet(net); + __u32 flags = 0, mask = 0; int ret = 0; if (nfqa[NFQA_CFG_CMD]) { @@ -1131,6 +1126,40 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, } } + /* Check if we support these flags in first place, dependencies should + * be there too not to break atomicity. + */ + if (nfqa[NFQA_CFG_FLAGS]) { + if (!nfqa[NFQA_CFG_MASK]) { + /* A mask is needed to specify which flags are being + * changed. + */ + return -EINVAL; + } + + flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); + mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); + + if (flags >= NFQA_CFG_F_MAX) + return -EOPNOTSUPP; + +#if !IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (flags & mask & NFQA_CFG_F_SECCTX) + return -EOPNOTSUPP; +#endif + if ((flags & mask & NFQA_CFG_F_CONNTRACK) && + !rcu_access_pointer(nfnl_ct_hook)) { +#ifdef CONFIG_MODULES + nfnl_unlock(NFNL_SUBSYS_QUEUE); + request_module("ip_conntrack_netlink"); + nfnl_lock(NFNL_SUBSYS_QUEUE); + if (rcu_access_pointer(nfnl_ct_hook)) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + } + rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { @@ -1158,70 +1187,38 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, goto err_out_unlock; } instance_destroy(q, queue); - break; + goto err_out_unlock; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: ret = -ENOTSUPP; - break; + goto err_out_unlock; } } + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + if (nfqa[NFQA_CFG_PARAMS]) { - struct nfqnl_msg_config_params *params; + struct nfqnl_msg_config_params *params = + nla_data(nfqa[NFQA_CFG_PARAMS]); - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { - __be32 *queue_maxlen; + __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } if (nfqa[NFQA_CFG_FLAGS]) { - __u32 flags, mask; - - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } - - if (!nfqa[NFQA_CFG_MASK]) { - /* A mask is needed to specify which flags are being - * changed. - */ - ret = -EINVAL; - goto err_out_unlock; - } - - flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); - mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); - - if (flags >= NFQA_CFG_F_MAX) { - ret = -EOPNOTSUPP; - goto err_out_unlock; - } -#if !IS_ENABLED(CONFIG_NETWORK_SECMARK) - if (flags & mask & NFQA_CFG_F_SECCTX) { - ret = -EOPNOTSUPP; - goto err_out_unlock; - } -#endif spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; @@ -1417,6 +1414,7 @@ static int __init nfnetlink_queue_init(void) cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); + unregister_pernet_subsys(&nfnl_queue_net_ops); out: return status; } diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index fde5145f2e36..383c17138399 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -8,6 +8,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include <asm/unaligned.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -39,6 +40,27 @@ static void nft_byteorder_eval(const struct nft_expr *expr, d = (void *)dst; switch (priv->size) { + case 8: { + u64 src64; + + switch (priv->op) { + case NFT_BYTEORDER_NTOH: + for (i = 0; i < priv->len / 8; i++) { + src64 = get_unaligned_be64(&src[i]); + src64 = be64_to_cpu((__force __be64)src64); + put_unaligned_be64(src64, &dst[i]); + } + break; + case NFT_BYTEORDER_HTON: + for (i = 0; i < priv->len / 8; i++) { + src64 = get_unaligned_be64(&src[i]); + src64 = (__force u64)cpu_to_be64(src64); + put_unaligned_be64(src64, &dst[i]); + } + break; + } + break; + } case 4: switch (priv->op) { case NFT_BYTEORDER_NTOH: @@ -101,6 +123,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, switch (priv->size) { case 2: case 4: + case 8: break; default: return -EINVAL; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 9c8fab00164b..454841baa4d0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -519,9 +519,9 @@ nla_put_failure: return -1; } -static int -nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +static int nfnl_compat_get(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[]) { int ret = 0, target; struct nfgenmsg *nfmsg; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 1067fb4c1ffa..c7808fc19719 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -47,27 +47,34 @@ static void nft_counter_eval(const struct nft_expr *expr, local_bh_enable(); } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, + struct nft_counter *total) { - struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - struct nft_counter_percpu *cpu_stats; - struct nft_counter total; + const struct nft_counter_percpu *cpu_stats; u64 bytes, packets; unsigned int seq; int cpu; - memset(&total, 0, sizeof(total)); + memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - cpu_stats = per_cpu_ptr(priv->counter, cpu); + cpu_stats = per_cpu_ptr(counter, cpu); do { seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); bytes = cpu_stats->counter.bytes; packets = cpu_stats->counter.packets; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); - total.packets += packets; - total.bytes += bytes; + total->packets += packets; + total->bytes += bytes; } +} + +static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter total; + + nft_counter_fetch(priv->counter, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) @@ -118,6 +125,31 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, free_percpu(priv->counter); } +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(src); + struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + struct nft_counter total; + + nft_counter_fetch(priv->counter, &total); + + cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, + GFP_ATOMIC); + if (cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + this_cpu->counter.packets = total.packets; + this_cpu->counter.bytes = total.bytes; + preempt_enable(); + + priv_clone->counter = cpu_stats; + return 0; +} + static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, @@ -126,6 +158,7 @@ static const struct nft_expr_ops nft_counter_ops = { .init = nft_counter_init, .destroy = nft_counter_destroy, .dump = nft_counter_dump, + .clone = nft_counter_clone, }; static struct nft_expr_type nft_counter_type __read_mostly = { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 8cbca3432f90..a0eb2161e3ef 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -16,6 +16,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> @@ -30,6 +31,18 @@ struct nft_ct { }; }; +static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, + enum nft_ct_keys k, + enum ip_conntrack_dir d) +{ + if (d < IP_CT_DIR_MAX) + return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) : + atomic64_read(&c[d].packets); + + return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) + + nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY); +} + static void nft_ct_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -114,6 +127,17 @@ static void nft_ct_get_eval(const struct nft_expr *expr, NF_CT_LABELS_MAX_SIZE - size); return; } + case NFT_CT_BYTES: /* fallthrough */ + case NFT_CT_PKTS: { + const struct nf_conn_acct *acct = nf_conn_acct_find(ct); + u64 count = 0; + + if (acct) + count = nft_ct_get_eval_counter(acct->counter, + priv->key, priv->dir); + memcpy(dest, &count, sizeof(count)); + return; + } #endif default: break; @@ -291,6 +315,13 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, return -EINVAL; len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all); break; + case NFT_CT_BYTES: + case NFT_CT_PKTS: + /* no direction? return sum of original + reply */ + if (tb[NFTA_CT_DIRECTION] == NULL) + priv->dir = IP_CT_DIR_MAX; + len = sizeof(u64); + break; default: return -EOPNOTSUPP; } @@ -366,6 +397,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; switch (priv->key) { + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: @@ -373,6 +405,13 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; + break; + case NFT_CT_BYTES: + case NFT_CT_PKTS: + if (priv->dir < IP_CT_DIR_MAX && + nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + break; default: break; } diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c new file mode 100644 index 000000000000..2cc1e0ef56e8 --- /dev/null +++ b/net/netfilter/nft_dup_netdev.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_dup_netdev.h> + +struct nft_dup_netdev { + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_netdev *priv = nft_expr_priv(expr); + int oif = regs->data[priv->sreg_dev]; + + nf_dup_netdev_egress(pkt, oif); +} + +static const struct nla_policy nft_dup_netdev_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static int nft_dup_netdev_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_netdev *priv = nft_expr_priv(expr); + + if (tb[NFTA_DUP_SREG_DEV] == NULL) + return -EINVAL; + + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); +} + +static const struct nft_expr_ops nft_dup_netdev_ingress_ops; + +static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_netdev *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_netdev_type; +static const struct nft_expr_ops nft_dup_netdev_ops = { + .type = &nft_dup_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_netdev)), + .eval = nft_dup_netdev_eval, + .init = nft_dup_netdev_init, + .dump = nft_dup_netdev_dump, +}; + +static struct nft_expr_type nft_dup_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "dup", + .ops = &nft_dup_netdev_ops, + .policy = nft_dup_netdev_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_netdev_module_init(void) +{ + return nft_register_expr(&nft_dup_netdev_type); +} + +static void __exit nft_dup_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_dup_netdev_type); +} + +module_init(nft_dup_netdev_module_init); +module_exit(nft_dup_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_AF_EXPR(5, "dup"); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 513a8ef60a59..9dec3bd1b63c 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, } ext = nft_set_elem_ext(set, elem); - if (priv->expr != NULL) - nft_expr_clone(nft_set_ext_expr(ext), priv->expr); + if (priv->expr != NULL && + nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) + return NULL; return elem; } diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c new file mode 100644 index 000000000000..763ebc3e0b2b --- /dev/null +++ b/net/netfilter/nft_fwd_netdev.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_dup_netdev.h> + +struct nft_fwd_netdev { + enum nft_registers sreg_dev:8; +}; + +static void nft_fwd_netdev_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_fwd_netdev *priv = nft_expr_priv(expr); + int oif = regs->data[priv->sreg_dev]; + + nf_dup_netdev_egress(pkt, oif); + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { + [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, +}; + +static int nft_fwd_netdev_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_fwd_netdev *priv = nft_expr_priv(expr); + + if (tb[NFTA_FWD_SREG_DEV] == NULL) + return -EINVAL; + + priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); +} + +static const struct nft_expr_ops nft_fwd_netdev_ingress_ops; + +static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_fwd_netdev *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_fwd_netdev_type; +static const struct nft_expr_ops nft_fwd_netdev_ops = { + .type = &nft_fwd_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)), + .eval = nft_fwd_netdev_eval, + .init = nft_fwd_netdev_init, + .dump = nft_fwd_netdev_dump, +}; + +static struct nft_expr_type nft_fwd_netdev_type __read_mostly = { + .family = NFPROTO_NETDEV, + .name = "fwd", + .ops = &nft_fwd_netdev_ops, + .policy = nft_fwd_netdev_policy, + .maxattr = NFTA_FWD_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_fwd_netdev_module_init(void) +{ + return nft_register_expr(&nft_fwd_netdev_type); +} + +static void __exit nft_fwd_netdev_module_exit(void) +{ + nft_unregister_expr(&nft_fwd_netdev_type); +} + +module_init(nft_fwd_netdev_module_init); +module_exit(nft_fwd_netdev_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_AF_EXPR(5, "fwd"); diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 5d67938f8b2f..99d18578afc6 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -26,6 +26,7 @@ struct nft_limit { u64 rate; u64 nsecs; u32 burst; + bool invert; }; static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) @@ -44,11 +45,11 @@ static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) if (delta >= 0) { limit->tokens = delta; spin_unlock_bh(&limit_lock); - return false; + return limit->invert; } limit->tokens = tokens; spin_unlock_bh(&limit_lock); - return true; + return !limit->invert; } static int nft_limit_init(struct nft_limit *limit, @@ -78,6 +79,12 @@ static int nft_limit_init(struct nft_limit *limit, limit->rate = rate; } + if (tb[NFTA_LIMIT_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); + + if (flags & NFT_LIMIT_F_INV) + limit->invert = true; + } limit->last = ktime_get_ns(); return 0; @@ -86,13 +93,15 @@ static int nft_limit_init(struct nft_limit *limit, static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, enum nft_limit_type type) { + u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0; u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); u64 rate = limit->rate - limit->burst; if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || - nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type))) + nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || + nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) goto nla_put_failure; return 0; @@ -120,6 +129,7 @@ static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, + [NFTA_LIMIT_FLAGS] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 9dfaf4d55ee0..fe885bf271c5 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -18,12 +18,16 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/smp.h> +#include <linux/static_key.h> #include <net/dst.h> #include <net/sock.h> #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nft_meta.h> +#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -174,7 +178,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) goto err; - *dest = sk->sk_classid; + *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif default: @@ -188,6 +192,13 @@ err: } EXPORT_SYMBOL_GPL(nft_meta_get_eval); +/* don't change or set _LOOPBACK, _USER, etc. */ +static bool pkt_type_ok(u32 p) +{ + return p == PACKET_HOST || p == PACKET_BROADCAST || + p == PACKET_MULTICAST || p == PACKET_OTHERHOST; +} + void nft_meta_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -203,6 +214,11 @@ void nft_meta_set_eval(const struct nft_expr *expr, case NFT_META_PRIORITY: skb->priority = value; break; + case NFT_META_PKTTYPE: + if (skb->pkt_type != value && + pkt_type_ok(value) && pkt_type_ok(skb->pkt_type)) + skb->pkt_type = value; + break; case NFT_META_NFTRACE: skb->nf_trace = 1; break; @@ -271,6 +287,24 @@ int nft_meta_get_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_get_init); +static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx) +{ + unsigned int hooks; + + switch (ctx->afi->family) { + case NFPROTO_BRIDGE: + hooks = 1 << NF_BR_PRE_ROUTING; + break; + case NFPROTO_NETDEV: + hooks = 1 << NF_NETDEV_INGRESS; + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + int nft_meta_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -288,6 +322,12 @@ int nft_meta_set_init(const struct nft_ctx *ctx, case NFT_META_NFTRACE: len = sizeof(u8); break; + case NFT_META_PKTTYPE: + err = nft_meta_set_init_pkttype(ctx); + if (err) + return err; + len = sizeof(u8); + break; default: return -EOPNOTSUPP; } @@ -297,6 +337,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (priv->key == NFT_META_NFTRACE) + static_branch_inc(&nft_trace_enabled); + return 0; } EXPORT_SYMBOL_GPL(nft_meta_set_init); @@ -334,6 +377,16 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_meta_set_dump); +void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + + if (priv->key == NFT_META_NFTRACE) + static_branch_dec(&nft_trace_enabled); +} +EXPORT_SYMBOL_GPL(nft_meta_set_destroy); + static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, @@ -348,6 +401,7 @@ static const struct nft_expr_ops nft_meta_set_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_set_eval, .init = nft_meta_set_init, + .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, }; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 09b4b07eb676..12cd4bf16d17 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -107,10 +107,13 @@ err: } static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { - [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, }; static int nft_payload_init(const struct nft_ctx *ctx, @@ -160,6 +163,118 @@ const struct nft_expr_ops nft_payload_fast_ops = { .dump = nft_payload_dump, }; +static void nft_payload_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_payload_set *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + const u32 *src = ®s->data[priv->sreg]; + int offset, csum_offset; + __wsum fsum, tsum; + __sum16 sum; + + switch (priv->base) { + case NFT_PAYLOAD_LL_HEADER: + if (!skb_mac_header_was_set(skb)) + goto err; + offset = skb_mac_header(skb) - skb->data; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + offset = skb_network_offset(skb); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + offset = pkt->xt.thoff; + break; + default: + BUG(); + } + + csum_offset = offset + priv->csum_offset; + offset += priv->offset; + + if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || + skb->ip_summed != CHECKSUM_PARTIAL)) { + if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + goto err; + + fsum = skb_checksum(skb, offset, priv->len, 0); + tsum = csum_partial(src, priv->len, 0); + sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum), + tsum)); + if (sum == 0) + sum = CSUM_MANGLED_0; + + if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + goto err; + } + + if (!skb_make_writable(skb, max(offset + priv->len, 0)) || + skb_store_bits(skb, offset, src, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload_set *priv = nft_expr_priv(expr); + + priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]); + + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) + priv->csum_type = + ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); + if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) + priv->csum_offset = + ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + + switch (priv->csum_type) { + case NFT_PAYLOAD_CSUM_NONE: + case NFT_PAYLOAD_CSUM_INET: + break; + default: + return -EOPNOTSUPP; + } + + return nft_validate_register_load(priv->sreg, priv->len); +} + +static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_payload_set *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_PAYLOAD_SREG, priv->sreg) || + nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || + nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || + nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) || + nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) || + nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET, + htonl(priv->csum_offset))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_payload_set_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)), + .eval = nft_payload_set_eval, + .init = nft_payload_set_init, + .dump = nft_payload_set_dump, +}; + static const struct nft_expr_ops * nft_payload_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -167,8 +282,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, enum nft_payload_bases base; unsigned int offset, len; - if (tb[NFTA_PAYLOAD_DREG] == NULL || - tb[NFTA_PAYLOAD_BASE] == NULL || + if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || tb[NFTA_PAYLOAD_LEN] == NULL) return ERR_PTR(-EINVAL); @@ -183,6 +297,15 @@ nft_payload_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EOPNOTSUPP); } + if (tb[NFTA_PAYLOAD_SREG] != NULL) { + if (tb[NFTA_PAYLOAD_DREG] != NULL) + return ERR_PTR(-EINVAL); + return &nft_payload_set_ops; + } + + if (tb[NFTA_PAYLOAD_DREG] == NULL) + return ERR_PTR(-EINVAL); + offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d4aaad747ea9..c8a0b7da5ff4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/audit.h> +#include <linux/user_namespace.h> #include <net/net_namespace.h> #include <linux/netfilter/x_tables.h> @@ -1226,6 +1227,8 @@ int xt_proto_init(struct net *net, u_int8_t af) #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; struct proc_dir_entry *proc; + kuid_t root_uid; + kgid_t root_gid; #endif if (af >= ARRAY_SIZE(xt_prefix)) @@ -1233,12 +1236,17 @@ int xt_proto_init(struct net *net, u_int8_t af) #ifdef CONFIG_PROC_FS + root_uid = make_kuid(net->user_ns, 0); + root_gid = make_kgid(net->user_ns, 0); + strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops, (void *)(unsigned long)af); if (!proc) goto out; + if (uid_valid(root_uid) && gid_valid(root_gid)) + proc_set_user(proc, root_uid, root_gid); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); @@ -1246,6 +1254,8 @@ int xt_proto_init(struct net *net, u_int8_t af) (void *)(unsigned long)af); if (!proc) goto out_remove_tables; + if (uid_valid(root_uid) && gid_valid(root_gid)) + proc_set_user(proc, root_uid, root_gid); strlcpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); @@ -1253,6 +1263,8 @@ int xt_proto_init(struct net *net, u_int8_t af) (void *)(unsigned long)af); if (!proc) goto out_remove_matches; + if (uid_valid(root_uid) && gid_valid(root_gid)) + proc_set_user(proc, root_uid, root_gid); #endif return 0; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index e7ac07e53b59..6669e68d589e 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -143,7 +143,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, goto out; } - timeout = timeout_find_get(timeout_name); + timeout = timeout_find_get(par->net, timeout_name); if (timeout == NULL) { ret = -ENOENT; pr_info("No such timeout policy \"%s\"\n", timeout_name); diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index a1d126f29463..a086a914865f 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -24,9 +24,9 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); -static int cgroup_mt_check(const struct xt_mtchk_param *par) +static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { - struct xt_cgroup_info *info = par->matchinfo; + struct xt_cgroup_info_v0 *info = par->matchinfo; if (info->invert & ~1) return -EINVAL; @@ -34,38 +34,110 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par) return 0; } +static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info_v1 *info = par->matchinfo; + struct cgroup *cgrp; + + if ((info->invert_path & ~1) || (info->invert_classid & ~1)) + return -EINVAL; + + if (!info->has_path && !info->has_classid) { + pr_info("xt_cgroup: no path or classid specified\n"); + return -EINVAL; + } + + if (info->has_path && info->has_classid) { + pr_info("xt_cgroup: both path and classid specified\n"); + return -EINVAL; + } + + if (info->has_path) { + cgrp = cgroup_get_from_path(info->path); + if (IS_ERR(cgrp)) { + pr_info("xt_cgroup: invalid path, errno=%ld\n", + PTR_ERR(cgrp)); + return -EINVAL; + } + info->priv = cgrp; + } + + return 0; +} + static bool -cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { - const struct xt_cgroup_info *info = par->matchinfo; + const struct xt_cgroup_info_v0 *info = par->matchinfo; if (skb->sk == NULL || !sk_fullsock(skb->sk)) return false; - return (info->id == skb->sk->sk_classid) ^ info->invert; + return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ + info->invert; +} + +static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info_v1 *info = par->matchinfo; + struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; + struct cgroup *ancestor = info->priv; + + if (!skb->sk || !sk_fullsock(skb->sk)) + return false; + + if (ancestor) + return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ + info->invert_path; + else + return (info->classid == sock_cgroup_classid(skcd)) ^ + info->invert_classid; +} + +static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) +{ + struct xt_cgroup_info_v1 *info = par->matchinfo; + + if (info->priv) + cgroup_put(info->priv); } -static struct xt_match cgroup_mt_reg __read_mostly = { - .name = "cgroup", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = cgroup_mt_check, - .match = cgroup_mt, - .matchsize = sizeof(struct xt_cgroup_info), - .me = THIS_MODULE, - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_IN), +static struct xt_match cgroup_mt_reg[] __read_mostly = { + { + .name = "cgroup", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v0, + .match = cgroup_mt_v0, + .matchsize = sizeof(struct xt_cgroup_info_v0), + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, + { + .name = "cgroup", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v1, + .match = cgroup_mt_v1, + .matchsize = sizeof(struct xt_cgroup_info_v1), + .destroy = cgroup_mt_destroy_v1, + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, }; static int __init cgroup_mt_init(void) { - return xt_register_match(&cgroup_mt_reg); + return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } static void __exit cgroup_mt_exit(void) { - xt_unregister_match(&cgroup_mt_reg); + xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } module_init(cgroup_mt_init); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index df8801e02a32..4e3c3affd285 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -61,8 +61,8 @@ static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, }; -static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int xt_osf_add_callback(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[]) { struct xt_osf_user_finger *f; @@ -104,7 +104,8 @@ static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, return err; } -static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, +static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[]) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 59651af8cc27..81dc1bb6e016 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2915,6 +2915,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); + cb->start = control->start; cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; @@ -2927,6 +2928,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + if (cb->start) + cb->start(cb); + ret = netlink_dump(sk); sock_put(sk); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index bc0e504f33a6..f830326b3b1d 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -185,7 +185,7 @@ static int genl_allocate_reserve_groups(int n_groups, int *first_id) } } - if (id >= mc_groups_longs * BITS_PER_LONG) { + if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); @@ -513,6 +513,20 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, } EXPORT_SYMBOL(genlmsg_put); +static int genl_lock_start(struct netlink_callback *cb) +{ + /* our ops are always const - netlink API doesn't propagate that */ + const struct genl_ops *ops = cb->data; + int rc = 0; + + if (ops->start) { + genl_lock(); + rc = ops->start(cb); + genl_unlock(); + } + return rc; +} + static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { /* our ops are always const - netlink API doesn't propagate that */ @@ -577,6 +591,7 @@ static int genl_family_rcv_msg(struct genl_family *family, .module = family->module, /* we have const, but the netlink API doesn't */ .data = (void *)ops, + .start = genl_lock_start, .dump = genl_lock_dumpit, .done = genl_lock_done, }; @@ -588,6 +603,7 @@ static int genl_family_rcv_msg(struct genl_family *family, } else { struct netlink_dump_control c = { .module = family->module, + .start = ops->start, .dump = ops->dumpit, .done = ops->done, }; diff --git a/net/nfc/core.c b/net/nfc/core.c index 1fe3d3b362c0..122bb81da918 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -953,6 +953,19 @@ out: } EXPORT_SYMBOL(nfc_se_transaction); +int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx) +{ + int rc; + + pr_debug("connectivity: %x\n", se_idx); + + device_lock(&dev->dev); + rc = nfc_genl_se_connectivity(dev, se_idx); + device_unlock(&dev->dev); + return rc; +} +EXPORT_SYMBOL(nfc_se_connectivity); + static void nfc_release(struct device *d) { struct nfc_dev *dev = to_nfc_dev(d); diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 23c2a118ac9f..dd9003f38822 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -20,7 +20,8 @@ #include "digital.h" #define DIGITAL_PROTO_NFCA_RF_TECH \ - (NFC_PROTO_JEWEL_MASK | NFC_PROTO_MIFARE_MASK | NFC_PROTO_NFC_DEP_MASK) + (NFC_PROTO_JEWEL_MASK | NFC_PROTO_MIFARE_MASK | \ + NFC_PROTO_NFC_DEP_MASK | NFC_PROTO_ISO14443_MASK) #define DIGITAL_PROTO_NFCB_RF_TECH NFC_PROTO_ISO14443_B_MASK diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index b7de0da46acd..ecf0a0196f18 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -572,7 +572,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock, if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 10c99a578421..fbb7a2b57b44 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -610,14 +610,14 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, struct nci_core_conn_create_cmd *cmd; struct core_conn_create_data data; + if (!number_destination_params) + return -EINVAL; + data.length = params_len + sizeof(struct nci_core_conn_create_cmd); cmd = kzalloc(data.length, GFP_KERNEL); if (!cmd) return -ENOMEM; - if (!number_destination_params) - return -EINVAL; - cmd->destination_type = destination_type; cmd->number_destination_params = number_destination_params; memcpy(cmd->params, params, params_len); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 2aedac15cb59..a0ab26d535dc 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -676,7 +676,7 @@ int nci_hci_connect_gate(struct nci_dev *ndev, break; default: pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r); - if (pipe < 0) + if (pipe == NCI_HCI_INVALID_PIPE) return r; pipe_created = true; break; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f58c1fba1026..ea023b35f1c2 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -552,6 +552,43 @@ free_msg: return -EMSGSIZE; } +int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) +{ + struct nfc_se *se; + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, + NFC_EVENT_SE_CONNECTIVITY); + if (!hdr) + goto free_msg; + + se = nfc_find_se(dev, se_idx); + if (!se) + goto free_msg; + + if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || + nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || + nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; +} + static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index c20b784ad720..6c6f76b370b1 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -105,6 +105,7 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type); int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx); int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction); +int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx); struct nfc_dev *nfc_get_device(unsigned int idx); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c88d0f2d3e01..2d59df521915 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1160,17 +1160,26 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key) { - int level = this_cpu_read(exec_actions_level); - int err; + static const int ovs_recursion_limit = 5; + int err, level; + + level = __this_cpu_inc_return(exec_actions_level); + if (unlikely(level > ovs_recursion_limit)) { + net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", + ovs_dp_name(dp)); + kfree_skb(skb); + err = -ENETDOWN; + goto out; + } - this_cpu_inc(exec_actions_level); err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); - if (!level) + if (level == 1) process_deferred_actions(dp); - this_cpu_dec(exec_actions_level); +out: + __this_cpu_dec(exec_actions_level); return err; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c2cc11168fd5..ee6ff8ffc12d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -53,6 +53,8 @@ struct ovs_conntrack_info { struct md_labels labels; }; +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); + static u16 key_to_nfproto(const struct sw_flow_key *key) { switch (ntohs(key->eth.type)) { @@ -141,6 +143,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, * previously sent the packet to conntrack via the ct action. */ static void ovs_ct_update_key(const struct sk_buff *skb, + const struct ovs_conntrack_info *info, struct sw_flow_key *key, bool post_ct) { const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; @@ -158,13 +161,15 @@ static void ovs_ct_update_key(const struct sk_buff *skb, zone = nf_ct_zone(ct); } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + if (info) + zone = &info->zone; } __ovs_ct_update_key(key, state, zone, ct); } void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { - ovs_ct_update_key(skb, key, false); + ovs_ct_update_key(skb, NULL, key, false); } int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) @@ -300,10 +305,10 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, u16 zone, struct sk_buff *skb) { struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + int err; if (key->eth.type == htons(ETH_P_IP)) { enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; - int err; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); err = ip_defrag(net, skb, user); @@ -314,28 +319,13 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) } else if (key->eth.type == htons(ETH_P_IPV6)) { enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - struct sk_buff *reasm; memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - reasm = nf_ct_frag6_gather(net, skb, user); - if (!reasm) - return -EINPROGRESS; - - if (skb == reasm) { - kfree_skb(skb); - return -EINVAL; - } - - /* Don't free 'skb' even though it is one of the original - * fragments, as we're going to morph it into the head. - */ - skb_get(skb); - nf_ct_frag6_consume_orig(reasm); + err = nf_ct_frag6_gather(net, skb, user); + if (err) + return err; - key->ip.proto = ipv6_hdr(reasm)->nexthdr; - skb_morph(skb, reasm); - skb->next = reasm->next; - consume_skb(reasm); + key->ip.proto = ipv6_hdr(skb)->nexthdr; ovs_cb.mru = IP6CB(skb)->frag_max_size; #endif } else { @@ -418,7 +408,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, } } - ovs_ct_update_key(skb, key, true); + ovs_ct_update_key(skb, info, key, true); return 0; } @@ -693,6 +683,10 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -704,11 +698,9 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: - nf_conntrack_free(ct_info.ct); + __ovs_ct_free_action(&ct_info); return err; } @@ -750,6 +742,11 @@ void ovs_ct_free_action(const struct nlattr *a) { struct ovs_conntrack_info *ct_info = nla_data(a); + __ovs_ct_free_action(ct_info); +} + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) +{ if (ct_info->helper) module_put(ct_info->helper->me); if (ct_info->ct) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 91a8b004dc51..deadfdab1bc3 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -336,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, unsigned short gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; int err; - ovs_cb = *OVS_CB(skb); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) @@ -359,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, /* Queue all of the segments. */ skb = segs; do { - *OVS_CB(skb) = ovs_cb; if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index a7a80a6b77b0..653d073bae45 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -58,7 +58,7 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 907d6fd28ede..d1bd4a45ca2d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2434,7 +2434,10 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ovs_nla_put_tunnel_info(skb, tun_info); + err = ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); if (err) return err; nla_nest_end(skb, start); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index efb736bb6855..30ab8e127288 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -34,7 +34,7 @@ static struct vport_ops ovs_geneve_vport_ops; * @dst_port: destination port. */ struct geneve_port { - u16 port_no; + u16 dst_port; }; static inline struct geneve_port *geneve_vport(const struct vport *vport) @@ -47,7 +47,7 @@ static int geneve_get_options(const struct vport *vport, { struct geneve_port *geneve_port = geneve_vport(vport); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->dst_port)) return -EMSGSIZE; return 0; } @@ -83,7 +83,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return vport; geneve_port = geneve_vport(vport); - geneve_port->port_no = dst_port; + geneve_port->dst_port = dst_port; rtnl_lock(); dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); @@ -117,7 +117,6 @@ static struct vport_ops ovs_geneve_vport_ops = { .destroy = ovs_netdev_tunnel_destroy, .get_options = geneve_get_options, .send = dev_queue_xmit, - .owner = THIS_MODULE, }; static int __init ovs_geneve_tnl_init(void) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index c3257d78d3d2..7f8897f33a67 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -89,7 +89,6 @@ static struct vport_ops ovs_gre_vport_ops = { .create = gre_create, .send = dev_queue_xmit, .destroy = ovs_netdev_tunnel_destroy, - .owner = THIS_MODULE, }; static int __init ovs_gre_tnl_init(void) diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index b327368a3848..6a6adf314363 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -105,7 +105,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp)); + get_dpdev(vport->dp), NULL, NULL); if (err) goto error_unlock; @@ -180,9 +180,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); - /* Early release so we can unregister the device */ + /* We can be invoked by both explicit vport deletion and + * underlying netdev deregistration; delete the link only + * if it's not already shutting down. + */ + if (vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev); dev_put(vport->dev); - rtnl_delete_link(vport->dev); vport->dev = NULL; rtnl_unlock(); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 0ac0fd004d7e..31cbc8c5c7db 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -71,7 +71,7 @@ static struct hlist_head *hash_bucket(const struct net *net, const char *name) return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } -int ovs_vport_ops_register(struct vport_ops *ops) +int __ovs_vport_ops_register(struct vport_ops *ops) { int err = -EEXIST; struct vport_ops *o; @@ -87,7 +87,7 @@ errout: ovs_unlock(); return err; } -EXPORT_SYMBOL_GPL(ovs_vport_ops_register); +EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); void ovs_vport_ops_unregister(struct vport_ops *ops) { @@ -256,8 +256,8 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) * * @vport: vport to delete. * - * Detaches @vport from its datapath and destroys it. It is possible to fail - * for reasons such as lack of memory. ovs_mutex must be held. + * Detaches @vport from its datapath and destroys it. ovs_mutex must + * be held. */ void ovs_vport_del(struct vport *vport) { diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index bdfd82a7c064..c10899cb9040 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -70,7 +70,7 @@ struct vport_portids { /** * struct vport - one port within a datapath - * @rcu: RCU callback head for deferred destruction. + * @dev: Pointer to net_device. * @dp: Datapath to which this port belongs. * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. @@ -78,6 +78,7 @@ struct vport_portids { * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. * @detach_list: list used for detaching vport in net-exit call. + * @rcu: RCU callback head for deferred destruction. */ struct vport { struct net_device *dev; @@ -196,28 +197,14 @@ static inline const char *ovs_vport_name(struct vport *vport) return vport->dev->name; } -int ovs_vport_ops_register(struct vport_ops *ops); -void ovs_vport_ops_unregister(struct vport_ops *ops); - -static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ip_tunnel_key *key, - u32 mark, - struct flowi4 *fl, - u8 protocol) -{ - struct rtable *rt; - - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = mark; - fl->flowi4_proto = protocol; - - rt = ip_route_output_key(net, fl); - return rt; -} +int __ovs_vport_ops_register(struct vport_ops *ops); +#define ovs_vport_ops_register(ops) \ + ({ \ + (ops)->owner = THIS_MODULE; \ + __ovs_vport_ops_register(ops); \ + }) +void ovs_vport_ops_unregister(struct vport_ops *ops); void ovs_vport_send(struct vport *vport, struct sk_buff *skb); #endif /* vport.h */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index af399cac5205..992396aa635c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1741,6 +1741,20 @@ static void fanout_release(struct sock *sk) kfree_rcu(po->rollover, rcu); } +static bool packet_extra_vlan_len_allowed(const struct net_device *dev, + struct sk_buff *skb) +{ + /* Earlier code assumed this would be a VLAN pkt, double-check + * this now that we have the actual packet in hand. We can only + * do this check on Ethernet devices. + */ + if (unlikely(dev->type != ARPHRD_ETHER)) + return false; + + skb_reset_mac_header(skb); + return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); +} + static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; @@ -1902,18 +1916,10 @@ retry: goto retry; } - if (len > (dev->mtu + dev->hard_header_len + extra_len)) { - /* Earlier code assumed this would be a VLAN pkt, - * double-check this now that we have the actual - * packet in hand. - */ - struct ethhdr *ehdr; - skb_reset_mac_header(skb); - ehdr = eth_hdr(skb); - if (ehdr->h_proto != htons(ETH_P_8021Q)) { - err = -EMSGSIZE; - goto out_unlock; - } + if (len > (dev->mtu + dev->hard_header_len + extra_len) && + !packet_extra_vlan_len_allowed(dev, skb)) { + err = -EMSGSIZE; + goto out_unlock; } skb->protocol = proto; @@ -2323,8 +2329,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb) static bool ll_header_truncated(const struct net_device *dev, int len) { /* net device doesn't like empty head */ - if (unlikely(len <= dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n", + if (unlikely(len < dev->hard_header_len)) { + net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", current->comm, len, dev->hard_header_len); return true; } @@ -2332,6 +2338,15 @@ static bool ll_header_truncated(const struct net_device *dev, int len) return false; } +static void tpacket_set_protocol(const struct net_device *dev, + struct sk_buff *skb) +{ + if (dev->type == ARPHRD_ETHER) { + skb_reset_mac_header(skb); + skb->protocol = eth_hdr(skb)->h_proto; + } +} + static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, int size_max, __be16 proto, unsigned char *addr, int hlen) @@ -2368,8 +2383,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb_reserve(skb, hlen); skb_reset_network_header(skb); - if (!packet_use_direct_xmit(po)) - skb_probe_transport_header(skb, 0); if (unlikely(po->tp_tx_has_off)) { int off_min, off_max, off; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); @@ -2415,6 +2428,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, dev->hard_header_len); if (unlikely(err)) return err; + if (!skb->protocol) + tpacket_set_protocol(dev, skb); data += dev->hard_header_len; to_write -= dev->hard_header_len; @@ -2449,6 +2464,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, len = ((to_write > len_max) ? len_max : to_write); } + skb_probe_transport_header(skb, 0); + return tp_len; } @@ -2493,12 +2510,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - reserve = dev->hard_header_len + VLAN_HLEN; + if (po->sk.sk_socket->type == SOCK_RAW) + reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if (size_max > dev->mtu + reserve) - size_max = dev->mtu + reserve; + if (size_max > dev->mtu + reserve + VLAN_HLEN) + size_max = dev->mtu + reserve + VLAN_HLEN; do { ph = packet_current_frame(po, &po->tx_ring, @@ -2525,18 +2543,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, addr, hlen); if (likely(tp_len >= 0) && - tp_len > dev->mtu + dev->hard_header_len) { - struct ethhdr *ehdr; - /* Earlier code assumed this would be a VLAN pkt, - * double-check this now that we have the actual - * packet in hand. - */ + tp_len > dev->mtu + reserve && + !packet_extra_vlan_len_allowed(dev, skb)) + tp_len = -EMSGSIZE; - skb_reset_mac_header(skb); - ehdr = eth_hdr(skb); - if (ehdr->h_proto != htons(ETH_P_8021Q)) - tp_len = -EMSGSIZE; - } if (unlikely(tp_len < 0)) { if (po->tp_loss) { __packet_set_status(po, ph, @@ -2765,18 +2775,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); - if (!gso_type && (len > dev->mtu + reserve + extra_len)) { - /* Earlier code assumed this would be a VLAN pkt, - * double-check this now that we have the actual - * packet in hand. - */ - struct ethhdr *ehdr; - skb_reset_mac_header(skb); - ehdr = eth_hdr(skb); - if (ehdr->h_proto != htons(ETH_P_8021Q)) { - err = -EMSGSIZE; - goto out_free; - } + if (!gso_type && (len > dev->mtu + reserve + extra_len) && + !packet_extra_vlan_len_allowed(dev, skb)) { + err = -EMSGSIZE; + goto out_free; } skb->protocol = proto; @@ -2807,8 +2809,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) len += vnet_hdr_len; } - if (!packet_use_direct_xmit(po)) - skb_probe_transport_header(skb, reserve); + skb_probe_transport_header(skb, reserve); + if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -4107,7 +4109,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; - if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) + if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; if (po->tp_version >= TPACKET_V3 && (int)(req->tp_block_size - @@ -4119,8 +4121,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; - rb->frames_per_block = req->tp_block_size/req->tp_frame_size; - if (unlikely(rb->frames_per_block <= 0)) + rb->frames_per_block = req->tp_block_size / req->tp_frame_size; + if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 10d42f3220ab..f925753668a7 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -377,6 +377,10 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, struct sockaddr_pn sa; u16 len; + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + /* check we have at least a full Phonet header */ if (!pskb_pull(skb, sizeof(struct phonethdr))) goto out; diff --git a/net/rds/connection.c b/net/rds/connection.c index d4564036a339..e3b118cae81d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -186,12 +186,6 @@ static struct rds_connection *__rds_conn_create(struct net *net, } } - if (trans == NULL) { - kmem_cache_free(rds_conn_slab, conn); - conn = ERR_PTR(-ENODEV); - goto out; - } - conn->c_trans = trans; ret = trans->conn_alloc(conn, gfp); diff --git a/net/rds/ib.c b/net/rds/ib.c index f222885ac0c7..9481d55ff6cb 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct ib_device_attr *dev_attr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - goto free_attr; + return; spin_lock_init(&rds_ibdev->spinlock); atomic_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - rds_ibdev->max_wrs = dev_attr->max_qp_wr; - rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; + rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? - min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; + rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; - rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? - min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? + min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; - rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; - rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); @@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", - dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, rds_ibdev->max_8k_fmrs); @@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device) put_dev: rds_ib_dev_put(rds_ibdev); -free_attr: - kfree(dev_attr); } /* diff --git a/net/rds/iw.c b/net/rds/iw.c index 576f1825fc55..f4a9fff829e0 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns); static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; - struct ib_device_attr *dev_attr; /* Only handle iwarp devices */ if (device->node_type != RDMA_NODE_RNIC) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); if (!rds_iwdev) - goto free_attr; + return; spin_lock_init(&rds_iwdev->spinlock); - rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = dev_attr->max_qp_wr; - rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = device->attrs.max_qp_wr; + rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); @@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device) list_add_tail(&rds_iwdev->list, &rds_iw_devices); ib_set_client_data(device, &rds_iw_client, rds_iwdev); - - goto free_attr; + return; err_mr: if (rds_iwdev->mr) @@ -121,8 +110,6 @@ err_pd: ib_dealloc_pd(rds_iwdev->pd); free_dev: kfree(rds_iwdev); -free_attr: - kfree(dev_attr); } static void rds_iw_remove_one(struct ib_device *device, void *client_data) diff --git a/net/rds/page.c b/net/rds/page.c index 9005a2c920ee..5a14e6d6a926 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -179,37 +179,18 @@ out: } EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); -static int rds_page_remainder_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +void rds_page_exit(void) { - struct rds_page_remainder *rem; - long cpu = (long)hcpu; + unsigned int cpu; - rem = &per_cpu(rds_page_remainders, cpu); + for_each_possible_cpu(cpu) { + struct rds_page_remainder *rem; - rdsdebug("cpu %ld action 0x%lx\n", cpu, action); + rem = &per_cpu(rds_page_remainders, cpu); + rdsdebug("cpu %u\n", cpu); - switch (action) { - case CPU_DEAD: if (rem->r_page) __free_page(rem->r_page); rem->r_page = NULL; - break; } - - return 0; -} - -static struct notifier_block rds_page_remainder_nb = { - .notifier_call = rds_page_remainder_cpu_notify, -}; - -void rds_page_exit(void) -{ - int i; - - for_each_possible_cpu(i) - rds_page_remainder_cpu_notify(&rds_page_remainder_nb, - (unsigned long)CPU_DEAD, - (void *)(long)i); } diff --git a/net/rds/send.c b/net/rds/send.c index 827155c2ead1..c9cdb358ea88 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1013,11 +1013,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) release_sock(sk); } - /* racing with another thread binding seems ok here */ + lock_sock(sk); if (daddr == 0 || rs->rs_bound_addr == 0) { + release_sock(sk); ret = -ENOTCONN; /* XXX not a great errno */ goto out; } + release_sock(sk); if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index b41e9ea2ffff..f53bf3b6558b 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -49,7 +49,6 @@ struct rfkill { spinlock_t lock; - const char *name; enum rfkill_type type; unsigned long state; @@ -73,6 +72,7 @@ struct rfkill { struct delayed_work poll_work; struct work_struct uevent_work; struct work_struct sync_work; + char name[]; }; #define to_rfkill(d) container_of(d, struct rfkill, dev) @@ -876,14 +876,14 @@ struct rfkill * __must_check rfkill_alloc(const char *name, if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES)) return NULL; - rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL); + rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL); if (!rfkill) return NULL; spin_lock_init(&rfkill->lock); INIT_LIST_HEAD(&rfkill->node); rfkill->type = type; - rfkill->name = name; + strcpy(rfkill->name, name); rfkill->ops = ops; rfkill->data = ops_data; diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 93127220cb54..4b1e3f35f06c 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -163,10 +163,6 @@ static int rfkill_gpio_remove(struct platform_device *pdev) #ifdef CONFIG_ACPI static const struct acpi_device_id rfkill_acpi_match[] = { - { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, - { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, - { "BCM2E40", RFKILL_TYPE_BLUETOOTH }, - { "BCM2E64", RFKILL_TYPE_BLUETOOTH }, { "BCM4752", RFKILL_TYPE_GPS }, { "LNV4752", RFKILL_TYPE_GPS }, { }, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 1f8a144a5dc2..7e2d1057d8bc 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -67,7 +67,7 @@ static void rxrpc_write_space(struct sock *sk) if (rxrpc_writable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index e0547f521f20..adc555e0323d 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -723,8 +723,10 @@ process_further: if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) + hard > tx) { + call->acks_hard = tx; goto all_acked; + } smp_rmb(); rxrpc_rotate_tx_window(call, hard - 1); diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index da3cc09f683e..3f6571651d32 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -896,15 +896,9 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen) if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = kmalloc(optlen + 1, GFP_KERNEL); - if (!description) - return -ENOMEM; - - if (copy_from_user(description, optval, optlen)) { - kfree(description); - return -EFAULT; - } - description[optlen] = 0; + description = memdup_user_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); key = request_key(&key_type_rxrpc, description, NULL); if (IS_ERR(key)) { @@ -933,15 +927,9 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; - description = kmalloc(optlen + 1, GFP_KERNEL); - if (!description) - return -ENOMEM; - - if (copy_from_user(description, optval, optlen)) { - kfree(description); - return -EFAULT; - } - description[optlen] = 0; + description = memdup_user_nul(optval, optlen); + if (IS_ERR(description)) + return PTR_ERR(description); key = request_key(&key_type_keyring, description, NULL); if (IS_ERR(key)) { diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index a40d3afe93b7..14c4e12c47b0 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -531,7 +531,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ - clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index daa33432b716..82830824fb1f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -310,15 +310,21 @@ config NET_SCH_PIE If unsure, say N. config NET_SCH_INGRESS - tristate "Ingress Qdisc" + tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT select NET_INGRESS + select NET_EGRESS ---help--- - Say Y here if you want to use classifiers for incoming packets. + Say Y here if you want to use classifiers for incoming and/or outgoing + packets. This qdisc doesn't do anything else besides running classifiers, + which can also have actions attached to them. In case of outgoing packets, + classifiers that this qdisc holds are executed in the transmit path + before real enqueuing to an egress qdisc happens. + If unsure, say Y. - To compile this code as a module, choose M here: the - module will be called sch_ingress. + To compile this code as a module, choose M here: the module will be + called sch_ingress with alias of sch_clsact. config NET_SCH_PLUG tristate "Plug network traffic until release (PLUG)" diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5faaa5425f7b..8dc84300ee79 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -79,12 +79,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); + bool at_ingress = skb_at_tc_ingress(skb); struct cls_bpf_prog *prog; -#ifdef CONFIG_NET_CLS_ACT - bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; -#else - bool at_ingress = false; -#endif int ret = -1; if (unlikely(!skb_mac_header_was_set(skb))) @@ -295,7 +291,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, prog->bpf_name = name; prog->filter = fp; - if (fp->dst_needed) + if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS)) netif_keep_dst(qdisc_dev(tp->q)); return 0; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 57692947ebbe..95b021243233 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -252,23 +252,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)); + fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto)); + if (key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) { fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); } - if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + + if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)); fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)); - } else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) { + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)); @@ -276,6 +281,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, sizeof(key->ipv6.dst)); } + if (key->basic.ip_proto == IPPROTO_TCP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, &mask->tp.src, TCA_FLOWER_UNSPEC, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f43c8f33f09e..b5c2cf2aa6d4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -253,7 +253,8 @@ int qdisc_set_default(const char *name) } /* We know handle. Find qdisc among all qdisc's attached to device - (root qdisc, all its children, children of children etc.) + * (root qdisc, all its children, children of children etc.) + * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) @@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) root->handle == handle) return root; - list_for_each_entry(q, &root->list, list) { + list_for_each_entry_rcu(q, &root->list, list) { if (q->handle == handle) return q; } @@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q) struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); - list_add_tail(&q->list, &root->list); + ASSERT_RTNL(); + list_add_tail_rcu(&q->list, &root->list); } } EXPORT_SYMBOL(qdisc_list_add); void qdisc_list_del(struct Qdisc *q) { - if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) - list_del(&q->list); + if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { + ASSERT_RTNL(); + list_del_rcu(&q->list); + } } EXPORT_SYMBOL(qdisc_list_del); @@ -750,14 +754,18 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) if (n == 0) return; drops = max_t(int, n, 0); + rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) - return; + break; + if (sch->flags & TCQ_F_NOPARENT) + break; + /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { - WARN_ON(parentid != TC_H_ROOT); - return; + WARN_ON_ONCE(parentid != TC_H_ROOT); + break; } cops = sch->ops->cl_ops; if (cops->qlen_notify) { @@ -768,6 +776,7 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) sch->q.qlen -= n; __qdisc_qstats_drop(sch, drops); } + rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_decrease_qlen); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cb5d4ad32946..16bc83b2842a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -658,8 +658,10 @@ static void qdisc_rcu_free(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) + if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); + free_percpu(qdisc->cpu_qstats); + } kfree((char *) qdisc - qdisc->padded); } @@ -737,7 +739,7 @@ static void attach_one_default_qdisc(struct net_device *dev, return; } if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e7c648fa9dc3..10adbc617905 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -1,4 +1,5 @@ -/* net/sched/sch_ingress.c - Ingress qdisc +/* net/sched/sch_ingress.c - Ingress and clsact qdisc + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -98,17 +99,100 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static unsigned long clsact_get(struct Qdisc *sch, u32 classid) +{ + switch (TC_H_MIN(classid)) { + case TC_H_MIN(TC_H_MIN_INGRESS): + case TC_H_MIN(TC_H_MIN_EGRESS): + return TC_H_MIN(classid); + default: + return 0; + } +} + +static unsigned long clsact_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return clsact_get(sch, classid); +} + +static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + + switch (cl) { + case TC_H_MIN(TC_H_MIN_INGRESS): + return &dev->ingress_cl_list; + case TC_H_MIN(TC_H_MIN_EGRESS): + return &dev->egress_cl_list; + default: + return NULL; + } +} + +static int clsact_init(struct Qdisc *sch, struct nlattr *opt) +{ + net_inc_ingress_queue(); + net_inc_egress_queue(); + + sch->flags |= TCQ_F_CPUSTATS; + + return 0; +} + +static void clsact_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + + tcf_destroy_chain(&dev->ingress_cl_list); + tcf_destroy_chain(&dev->egress_cl_list); + + net_dec_ingress_queue(); + net_dec_egress_queue(); +} + +static const struct Qdisc_class_ops clsact_class_ops = { + .leaf = ingress_leaf, + .get = clsact_get, + .put = ingress_put, + .walk = ingress_walk, + .tcf_chain = clsact_find_tcf, + .bind_tcf = clsact_bind_filter, + .unbind_tcf = ingress_put, +}; + +static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { + .cl_ops = &clsact_class_ops, + .id = "clsact", + .init = clsact_init, + .destroy = clsact_destroy, + .dump = ingress_dump, + .owner = THIS_MODULE, +}; + static int __init ingress_module_init(void) { - return register_qdisc(&ingress_qdisc_ops); + int ret; + + ret = register_qdisc(&ingress_qdisc_ops); + if (!ret) { + ret = register_qdisc(&clsact_qdisc_ops); + if (ret) + unregister_qdisc(&ingress_qdisc_ops); + } + + return ret; } static void __exit ingress_module_exit(void) { unregister_qdisc(&ingress_qdisc_ops); + unregister_qdisc(&clsact_qdisc_ops); } module_init(ingress_module_init); module_exit(ingress_module_exit); +MODULE_ALIAS("sch_clsact"); MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index f3cbaecd283a..3e82f047caaf 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -63,7 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; @@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 3811a745452c..ad70ecf57ce7 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -132,7 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; - qdisc->flags |= TCQ_F_ONETXQUEUE; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } /* If the mqprio options indicate that hardware should own @@ -209,7 +209,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); if (new) - new->flags |= TCQ_F_ONETXQUEUE; + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 559afd0ee7de..2bf8ec92dde4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -383,6 +383,7 @@ void sctp_association_free(struct sctp_association *asoc) list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); list_del_rcu(pos); + sctp_unhash_transport(transport); sctp_transport_free(transport); } @@ -500,6 +501,8 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, /* Remove this peer from the list. */ list_del_rcu(&peer->transports); + /* Remove this peer from the transport hashtable */ + sctp_unhash_transport(peer); /* Get the first transport of asoc. */ pos = asoc->peer.transport_addr_list.next; @@ -699,6 +702,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, /* Attach the remote transport to our asoc. */ list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; + /* Add this peer into the transport hashtable */ + sctp_hash_transport(peer); /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 4f15b7d730e1..1543e39f47c3 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -809,8 +809,8 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, if (!has_sha1) return -EINVAL; - memcpy(ep->auth_hmacs_list->hmac_ids, &hmacs->shmac_idents[0], - hmacs->shmac_num_idents * sizeof(__u16)); + for (i = 0; i < hmacs->shmac_num_idents; i++) + ep->auth_hmacs_list->hmac_ids[i] = htons(hmacs->shmac_idents[i]); ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) + hmacs->shmac_num_idents * sizeof(__u16)); return 0; diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 9da76ba4d10f..2522a6175291 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -314,21 +314,16 @@ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, } /* Find the association that goes with this chunk. - * We do a linear search of the associations for this endpoint. - * We return the matching transport address too. + * We lookup the transport from hashtable at first, then get association + * through t->assoc. */ -static struct sctp_association *__sctp_endpoint_lookup_assoc( +struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **transport) { struct sctp_association *asoc = NULL; - struct sctp_association *tmp; - struct sctp_transport *t = NULL; - struct sctp_hashbucket *head; - struct sctp_ep_common *epb; - int hash; - int rport; + struct sctp_transport *t; *transport = NULL; @@ -337,45 +332,16 @@ static struct sctp_association *__sctp_endpoint_lookup_assoc( */ if (!ep->base.bind_addr.port) goto out; + t = sctp_epaddr_lookup_transport(ep, paddr); + if (!t) + goto out; - rport = ntohs(paddr->v4.sin_port); - - hash = sctp_assoc_hashfn(sock_net(ep->base.sk), ep->base.bind_addr.port, - rport); - head = &sctp_assoc_hashtable[hash]; - read_lock(&head->lock); - sctp_for_each_hentry(epb, &head->chain) { - tmp = sctp_assoc(epb); - if (tmp->ep != ep || rport != tmp->peer.port) - continue; - - t = sctp_assoc_lookup_paddr(tmp, paddr); - if (t) { - asoc = tmp; - *transport = t; - break; - } - } - read_unlock(&head->lock); + *transport = t; + asoc = t->asoc; out: return asoc; } -/* Lookup association on an endpoint based on a peer address. BH-safe. */ -struct sctp_association *sctp_endpoint_lookup_assoc( - const struct sctp_endpoint *ep, - const union sctp_addr *paddr, - struct sctp_transport **transport) -{ - struct sctp_association *asoc; - - local_bh_disable(); - asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport); - local_bh_enable(); - - return asoc; -} - /* Look for any peeled off association from the endpoint that matches the * given peer address. */ diff --git a/net/sctp/input.c b/net/sctp/input.c index b6493b3f11a9..bf61dfb8e09e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -782,65 +782,149 @@ hit: return ep; } -/* Insert association into the hash table. */ -static void __sctp_hash_established(struct sctp_association *asoc) +/* rhashtable for transport */ +struct sctp_hash_cmp_arg { + const struct sctp_endpoint *ep; + const union sctp_addr *laddr; + const union sctp_addr *paddr; + const struct net *net; +}; + +static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) { - struct net *net = sock_net(asoc->base.sk); - struct sctp_ep_common *epb; - struct sctp_hashbucket *head; + const struct sctp_hash_cmp_arg *x = arg->key; + const struct sctp_transport *t = ptr; + struct sctp_association *asoc = t->asoc; + const struct net *net = x->net; - epb = &asoc->base; + if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) + return 1; + if (!net_eq(sock_net(asoc->base.sk), net)) + return 1; + if (x->ep) { + if (x->ep != asoc->ep) + return 1; + } else { + if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port)) + return 1; + if (!sctp_bind_addr_match(&asoc->base.bind_addr, + x->laddr, sctp_sk(asoc->base.sk))) + return 1; + } - /* Calculate which chain this entry will belong to. */ - epb->hashent = sctp_assoc_hashfn(net, epb->bind_addr.port, - asoc->peer.port); + return 0; +} - head = &sctp_assoc_hashtable[epb->hashent]; +static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct sctp_transport *t = data; + const union sctp_addr *paddr = &t->ipaddr; + const struct net *net = sock_net(t->asoc->base.sk); + u16 lport = htons(t->asoc->base.bind_addr.port); + u32 addr; + + if (paddr->sa.sa_family == AF_INET6) + addr = jhash(&paddr->v6.sin6_addr, 16, seed); + else + addr = paddr->v4.sin_addr.s_addr; - write_lock(&head->lock); - hlist_add_head(&epb->node, &head->chain); - write_unlock(&head->lock); + return jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 | + (__force __u32)lport, net_hash_mix(net), seed); } -/* Add an association to the hash. Local BH-safe. */ -void sctp_hash_established(struct sctp_association *asoc) +static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed) { - if (asoc->temp) - return; + const struct sctp_hash_cmp_arg *x = data; + const union sctp_addr *paddr = x->paddr; + const struct net *net = x->net; + u16 lport; + u32 addr; + + lport = x->ep ? htons(x->ep->base.bind_addr.port) : + x->laddr->v4.sin_port; + if (paddr->sa.sa_family == AF_INET6) + addr = jhash(&paddr->v6.sin6_addr, 16, seed); + else + addr = paddr->v4.sin_addr.s_addr; - local_bh_disable(); - __sctp_hash_established(asoc); - local_bh_enable(); + return jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 | + (__force __u32)lport, net_hash_mix(net), seed); } -/* Remove association from the hash table. */ -static void __sctp_unhash_established(struct sctp_association *asoc) +static const struct rhashtable_params sctp_hash_params = { + .head_offset = offsetof(struct sctp_transport, node), + .hashfn = sctp_hash_key, + .obj_hashfn = sctp_hash_obj, + .obj_cmpfn = sctp_hash_cmp, + .automatic_shrinking = true, +}; + +int sctp_transport_hashtable_init(void) { - struct net *net = sock_net(asoc->base.sk); - struct sctp_hashbucket *head; - struct sctp_ep_common *epb; + return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params); +} - epb = &asoc->base; +void sctp_transport_hashtable_destroy(void) +{ + rhashtable_destroy(&sctp_transport_hashtable); +} - epb->hashent = sctp_assoc_hashfn(net, epb->bind_addr.port, - asoc->peer.port); +void sctp_hash_transport(struct sctp_transport *t) +{ + struct sctp_hash_cmp_arg arg; - head = &sctp_assoc_hashtable[epb->hashent]; + if (t->asoc->temp) + return; - write_lock(&head->lock); - hlist_del_init(&epb->node); - write_unlock(&head->lock); + arg.ep = t->asoc->ep; + arg.paddr = &t->ipaddr; + arg.net = sock_net(t->asoc->base.sk); + +reinsert: + if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg, + &t->node, sctp_hash_params) == -EBUSY) + goto reinsert; } -/* Remove association from the hash table. Local BH-safe. */ -void sctp_unhash_established(struct sctp_association *asoc) +void sctp_unhash_transport(struct sctp_transport *t) { - if (asoc->temp) + if (t->asoc->temp) return; - local_bh_disable(); - __sctp_unhash_established(asoc); - local_bh_enable(); + rhashtable_remove_fast(&sctp_transport_hashtable, &t->node, + sctp_hash_params); +} + +struct sctp_transport *sctp_addrs_lookup_transport( + struct net *net, + const union sctp_addr *laddr, + const union sctp_addr *paddr) +{ + struct sctp_hash_cmp_arg arg = { + .ep = NULL, + .laddr = laddr, + .paddr = paddr, + .net = net, + }; + + return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, + sctp_hash_params); +} + +struct sctp_transport *sctp_epaddr_lookup_transport( + const struct sctp_endpoint *ep, + const union sctp_addr *paddr) +{ + struct net *net = sock_net(ep->base.sk); + struct sctp_hash_cmp_arg arg = { + .ep = ep, + .paddr = paddr, + .net = net, + }; + + return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg, + sctp_hash_params); } /* Look up an association. */ @@ -850,38 +934,19 @@ static struct sctp_association *__sctp_lookup_association( const union sctp_addr *peer, struct sctp_transport **pt) { - struct sctp_hashbucket *head; - struct sctp_ep_common *epb; - struct sctp_association *asoc; - struct sctp_transport *transport; - int hash; - - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - hash = sctp_assoc_hashfn(net, ntohs(local->v4.sin_port), - ntohs(peer->v4.sin_port)); - head = &sctp_assoc_hashtable[hash]; - read_lock(&head->lock); - sctp_for_each_hentry(epb, &head->chain) { - asoc = sctp_assoc(epb); - transport = sctp_assoc_is_match(asoc, net, local, peer); - if (transport) - goto hit; - } + struct sctp_transport *t; - read_unlock(&head->lock); + t = sctp_addrs_lookup_transport(net, local, peer); + if (!t || t->dead) + return NULL; - return NULL; + sctp_association_hold(t->asoc); + *pt = t; -hit: - *pt = transport; - sctp_association_hold(asoc); - read_unlock(&head->lock); - return asoc; + return t->asoc; } -/* Look up an association. BH-safe. */ +/* Look up an association. protected by RCU read lock */ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, @@ -890,9 +955,9 @@ struct sctp_association *sctp_lookup_association(struct net *net, { struct sctp_association *asoc; - local_bh_disable(); + rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp); - local_bh_enable(); + rcu_read_unlock(); return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e917d27328ea..ec529121f38a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,6 +209,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); @@ -220,7 +221,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); - return ip6_xmit(sk, skb, fl6, np->opt, np->tclass); + rcu_read_lock(); + res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); + rcu_read_unlock(); + return res; } /* Returns the dst cache entry for the given source and destination ip @@ -262,7 +266,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, pr_debug("src=%pI6 - ", &fl6->saddr); } - final_p = fl6_update_dst(fl6, np->opt, &final); + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + rcu_read_unlock(); + dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!asoc || saddr) goto out; @@ -316,14 +323,13 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, } } } - rcu_read_unlock(); - if (baddr) { fl6->saddr = baddr->v6.sin6_addr; fl6->fl6_sport = baddr->v6.sin6_port; - final_p = fl6_update_dst(fl6, np->opt, &final); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); dst = ip6_dst_lookup_flow(sk, fl6, final_p); } + rcu_read_unlock(); out: if (!IS_ERR_OR_NULL(dst)) { @@ -635,6 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; + struct ipv6_txoptions *opt; newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) @@ -654,6 +661,13 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + rcu_read_lock(); + opt = rcu_dereference(np->opt); + if (opt) + opt = ipv6_dup_options(newsk, opt); + RCU_INIT_POINTER(newnp->opt, opt); + rcu_read_unlock(); + /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). */ diff --git a/net/sctp/output.c b/net/sctp/output.c index abe7c2db2412..9d610eddd19e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -534,7 +534,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>. */ if (!sctp_checksum_disable) { - if (!(dst->dev->features & NETIF_F_SCTP_CSUM) || + if (!(dst->dev->features & NETIF_F_SCTP_CRC) || (dst_xfrm(dst) != NULL) || packet->ipfragok) { sh->checksum = sctp_compute_cksum(nskb, 0); } else { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 7e8f0a117106..c0380cfb16ae 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -324,6 +324,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); + sctp_chunk_hold(chunk); sctp_outq_tail_data(q, chunk); if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); @@ -1251,6 +1252,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) */ sack_a_rwnd = ntohl(sack->a_rwnd); + asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 0697eda5aed8..684c5b31563b 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -281,88 +281,136 @@ void sctp_eps_proc_exit(struct net *net) remove_proc_entry("eps", net->sctp.proc_net_sctp); } +struct sctp_ht_iter { + struct seq_net_private p; + struct rhashtable_iter hti; +}; -static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) +static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq) { - if (*pos >= sctp_assoc_hashsize) - return NULL; + struct sctp_ht_iter *iter = seq->private; + struct sctp_transport *t; - if (*pos < 0) - *pos = 0; + t = rhashtable_walk_next(&iter->hti); + for (; t; t = rhashtable_walk_next(&iter->hti)) { + if (IS_ERR(t)) { + if (PTR_ERR(t) == -EAGAIN) + continue; + break; + } - if (*pos == 0) - seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " - "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " - "RPORT LADDRS <-> RADDRS " - "HBINT INS OUTS MAXRT T1X T2X RTXC " - "wmema wmemq sndbuf rcvbuf\n"); + if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) && + t->asoc->peer.primary_path == t) + break; + } - return (void *)pos; + return t; } -static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) +static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq, + loff_t pos) +{ + void *obj = SEQ_START_TOKEN; + + while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj)) + pos--; + + return obj; +} + +static int sctp_transport_walk_start(struct seq_file *seq) { + struct sctp_ht_iter *iter = seq->private; + int err; + + err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti); + if (err) + return err; + + err = rhashtable_walk_start(&iter->hti); + + return err == -EAGAIN ? 0 : err; } +static void sctp_transport_walk_stop(struct seq_file *seq) +{ + struct sctp_ht_iter *iter = seq->private; + + rhashtable_walk_stop(&iter->hti); + rhashtable_walk_exit(&iter->hti); +} + +static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) +{ + int err = sctp_transport_walk_start(seq); + + if (err) + return ERR_PTR(err); + + return sctp_transport_get_idx(seq, *pos); +} + +static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) +{ + sctp_transport_walk_stop(seq); +} static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - if (++*pos >= sctp_assoc_hashsize) - return NULL; + ++*pos; - return pos; + return sctp_transport_get_next(seq); } /* Display sctp associations (/proc/net/sctp/assocs). */ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) { - struct sctp_hashbucket *head; - struct sctp_ep_common *epb; + struct sctp_transport *transport; struct sctp_association *assoc; + struct sctp_ep_common *epb; struct sock *sk; - int hash = *(loff_t *)v; - - if (hash >= sctp_assoc_hashsize) - return -ENOMEM; - head = &sctp_assoc_hashtable[hash]; - local_bh_disable(); - read_lock(&head->lock); - sctp_for_each_hentry(epb, &head->chain) { - assoc = sctp_assoc(epb); - sk = epb->sk; - if (!net_eq(sock_net(sk), seq_file_net(seq))) - continue; - seq_printf(seq, - "%8pK %8pK %-3d %-3d %-2d %-4d " - "%4d %8d %8d %7u %5lu %-5d %5d ", - assoc, sk, sctp_sk(sk)->type, sk->sk_state, - assoc->state, hash, - assoc->assoc_id, - assoc->sndbuf_used, - atomic_read(&assoc->rmem_alloc), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), - sock_i_ino(sk), - epb->bind_addr.port, - assoc->peer.port); - seq_printf(seq, " "); - sctp_seq_dump_local_addrs(seq, epb); - seq_printf(seq, "<-> "); - sctp_seq_dump_remote_addrs(seq, assoc); - seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " - "%8d %8d %8d %8d", - assoc->hbinterval, assoc->c.sinit_max_instreams, - assoc->c.sinit_num_ostreams, assoc->max_retrans, - assoc->init_retries, assoc->shutdown_retries, - assoc->rtx_data_chunks, - atomic_read(&sk->sk_wmem_alloc), - sk->sk_wmem_queued, - sk->sk_sndbuf, - sk->sk_rcvbuf); - seq_printf(seq, "\n"); + if (v == SEQ_START_TOKEN) { + seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " + "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " + "RPORT LADDRS <-> RADDRS " + "HBINT INS OUTS MAXRT T1X T2X RTXC " + "wmema wmemq sndbuf rcvbuf\n"); + return 0; } - read_unlock(&head->lock); - local_bh_enable(); + + transport = (struct sctp_transport *)v; + assoc = transport->asoc; + epb = &assoc->base; + sk = epb->sk; + + seq_printf(seq, + "%8pK %8pK %-3d %-3d %-2d %-4d " + "%4d %8d %8d %7u %5lu %-5d %5d ", + assoc, sk, sctp_sk(sk)->type, sk->sk_state, + assoc->state, 0, + assoc->assoc_id, + assoc->sndbuf_used, + atomic_read(&assoc->rmem_alloc), + from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + sock_i_ino(sk), + epb->bind_addr.port, + assoc->peer.port); + seq_printf(seq, " "); + sctp_seq_dump_local_addrs(seq, epb); + seq_printf(seq, "<-> "); + sctp_seq_dump_remote_addrs(seq, assoc); + seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " + "%8d %8d %8d %8d", + assoc->hbinterval, assoc->c.sinit_max_instreams, + assoc->c.sinit_num_ostreams, assoc->max_retrans, + assoc->init_retries, assoc->shutdown_retries, + assoc->rtx_data_chunks, + atomic_read(&sk->sk_wmem_alloc), + sk->sk_wmem_queued, + sk->sk_sndbuf, + sk->sk_rcvbuf); + seq_printf(seq, "\n"); return 0; } @@ -378,7 +426,7 @@ static const struct seq_operations sctp_assoc_ops = { static int sctp_assocs_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &sctp_assoc_ops, - sizeof(struct seq_net_private)); + sizeof(struct sctp_ht_iter)); } static const struct file_operations sctp_assocs_seq_fops = { @@ -409,112 +457,94 @@ void sctp_assocs_proc_exit(struct net *net) static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) { - if (*pos >= sctp_assoc_hashsize) - return NULL; - - if (*pos < 0) - *pos = 0; + int err = sctp_transport_walk_start(seq); - if (*pos == 0) - seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " - "REM_ADDR_RTX START STATE\n"); + if (err) + return ERR_PTR(err); - return (void *)pos; + return sctp_transport_get_idx(seq, *pos); } static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - if (++*pos >= sctp_assoc_hashsize) - return NULL; + ++*pos; - return pos; + return sctp_transport_get_next(seq); } static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) { + sctp_transport_walk_stop(seq); } static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { - struct sctp_hashbucket *head; - struct sctp_ep_common *epb; struct sctp_association *assoc; struct sctp_transport *tsp; - int hash = *(loff_t *)v; - if (hash >= sctp_assoc_hashsize) - return -ENOMEM; + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " + "REM_ADDR_RTX START STATE\n"); + return 0; + } - head = &sctp_assoc_hashtable[hash]; - local_bh_disable(); - read_lock(&head->lock); - rcu_read_lock(); - sctp_for_each_hentry(epb, &head->chain) { - if (!net_eq(sock_net(epb->sk), seq_file_net(seq))) + tsp = (struct sctp_transport *)v; + assoc = tsp->asoc; + + list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, + transports) { + if (tsp->dead) continue; - assoc = sctp_assoc(epb); - list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, - transports) { - if (tsp->dead) - continue; + /* + * The remote address (ADDR) + */ + tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr); + seq_printf(seq, " "); + /* + * The association ID (ASSOC_ID) + */ + seq_printf(seq, "%d ", tsp->asoc->assoc_id); + + /* + * If the Heartbeat is active (HB_ACT) + * Note: 1 = Active, 0 = Inactive + */ + seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer)); + + /* + * Retransmit time out (RTO) + */ + seq_printf(seq, "%lu ", tsp->rto); + + /* + * Maximum path retransmit count (PATH_MAX_RTX) + */ + seq_printf(seq, "%d ", tsp->pathmaxrxt); + + /* + * remote address retransmit count (REM_ADDR_RTX) + * Note: We don't have a way to tally this at the moment + * so lets just leave it as zero for the moment + */ + seq_puts(seq, "0 "); + + /* + * remote address start time (START). This is also not + * currently implemented, but we can record it with a + * jiffies marker in a subsequent patch + */ + seq_puts(seq, "0 "); + + /* + * The current state of this destination. I.e. + * SCTP_ACTIVE, SCTP_INACTIVE, ... + */ + seq_printf(seq, "%d", tsp->state); - /* - * The remote address (ADDR) - */ - tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr); - seq_printf(seq, " "); - - /* - * The association ID (ASSOC_ID) - */ - seq_printf(seq, "%d ", tsp->asoc->assoc_id); - - /* - * If the Heartbeat is active (HB_ACT) - * Note: 1 = Active, 0 = Inactive - */ - seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer)); - - /* - * Retransmit time out (RTO) - */ - seq_printf(seq, "%lu ", tsp->rto); - - /* - * Maximum path retransmit count (PATH_MAX_RTX) - */ - seq_printf(seq, "%d ", tsp->pathmaxrxt); - - /* - * remote address retransmit count (REM_ADDR_RTX) - * Note: We don't have a way to tally this at the moment - * so lets just leave it as zero for the moment - */ - seq_puts(seq, "0 "); - - /* - * remote address start time (START). This is also not - * currently implemented, but we can record it with a - * jiffies marker in a subsequent patch - */ - seq_puts(seq, "0 "); - - /* - * The current state of this destination. I.e. - * SCTP_ACTIVE, SCTP_INACTIVE, ... - */ - seq_printf(seq, "%d", tsp->state); - - seq_printf(seq, "\n"); - } + seq_printf(seq, "\n"); } - rcu_read_unlock(); - read_unlock(&head->lock); - local_bh_enable(); - return 0; - } static const struct seq_operations sctp_remaddr_ops = { @@ -533,7 +563,7 @@ void sctp_remaddr_proc_exit(struct net *net) static int sctp_remaddr_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &sctp_remaddr_ops, - sizeof(struct seq_net_private)); + sizeof(struct sctp_ht_iter)); } static const struct file_operations sctp_remaddr_seq_fops = { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 3d9ea9a48289..ab0d538a74ed 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1223,6 +1223,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; + /* Enable pf state by default */ + net->sctp.pf_enable = 1; + /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts @@ -1413,24 +1416,6 @@ static __init int sctp_init(void) for (order = 0; (1UL << order) < goal; order++) ; - do { - sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE / - sizeof(struct sctp_hashbucket); - if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0) - continue; - sctp_assoc_hashtable = (struct sctp_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); - } while (!sctp_assoc_hashtable && --order > 0); - if (!sctp_assoc_hashtable) { - pr_err("Failed association hash alloc\n"); - status = -ENOMEM; - goto err_ahash_alloc; - } - for (i = 0; i < sctp_assoc_hashsize; i++) { - rwlock_init(&sctp_assoc_hashtable[i].lock); - INIT_HLIST_HEAD(&sctp_assoc_hashtable[i].chain); - } - /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; sctp_ep_hashtable = @@ -1452,7 +1437,7 @@ static __init int sctp_init(void) if ((sctp_port_hashsize > (64 * 1024)) && order > 0) continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order); + __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); @@ -1464,8 +1449,10 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } - pr_info("Hash tables configured (established %d bind %d)\n", - sctp_assoc_hashsize, sctp_port_hashsize); + if (sctp_transport_hashtable_init()) + goto err_thash_alloc; + + pr_info("Hash tables configured (bind %d)\n", sctp_port_hashsize); sctp_sysctl_register(); @@ -1518,12 +1505,10 @@ err_register_defaults: get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); err_bhash_alloc: + sctp_transport_hashtable_destroy(); +err_thash_alloc: kfree(sctp_ep_hashtable); err_ehash_alloc: - free_pages((unsigned long)sctp_assoc_hashtable, - get_order(sctp_assoc_hashsize * - sizeof(struct sctp_hashbucket))); -err_ahash_alloc: percpu_counter_destroy(&sctp_sockets_allocated); err_percpu_counter_init: kmem_cache_destroy(sctp_chunk_cachep); @@ -1557,13 +1542,11 @@ static __exit void sctp_exit(void) sctp_sysctl_unregister(); - free_pages((unsigned long)sctp_assoc_hashtable, - get_order(sctp_assoc_hashsize * - sizeof(struct sctp_hashbucket))); - kfree(sctp_ep_hashtable); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); + kfree(sctp_ep_hashtable); + sctp_transport_hashtable_destroy(); percpu_counter_destroy(&sctp_sockets_allocated); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 763e06a55155..5d6a03fad378 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1652,7 +1652,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, - ktime_get()); + ktime_get_real()); /* Copy the peer's init packet. */ memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr, @@ -1780,7 +1780,7 @@ no_hmac: if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else - kt = ktime_get(); + kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { /* diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 6098d4c42fa9..2e21384697c2 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, sctp_state_t state, struct sctp_endpoint *ep, - struct sctp_association *asoc, + struct sctp_association **asoc, void *event_arg, sctp_disposition_t status, sctp_cmd_seq_t *commands, @@ -477,6 +477,8 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, struct sctp_transport *transport, int is_hb) { + struct net *net = sock_net(asoc->base.sk); + /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ @@ -503,7 +505,8 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ - if ((transport->state == SCTP_ACTIVE) && + if (net->sctp.pf_enable && + (transport->state == SCTP_ACTIVE) && (asoc->pf_retrans < transport->pathmaxrxt) && (transport->error_count > asoc->pf_retrans)) { @@ -863,7 +866,6 @@ static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds, (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) return; - sctp_unhash_established(asoc); sctp_association_free(asoc); } @@ -1123,7 +1125,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, debug_post_sfn(); error = sctp_side_effects(event_type, subtype, state, - ep, asoc, event_arg, status, + ep, &asoc, event_arg, status, &commands, gfp); debug_post_sfx(); @@ -1136,7 +1138,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, sctp_state_t state, struct sctp_endpoint *ep, - struct sctp_association *asoc, + struct sctp_association **asoc, void *event_arg, sctp_disposition_t status, sctp_cmd_seq_t *commands, @@ -1151,7 +1153,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, * disposition SCTP_DISPOSITION_CONSUME. */ if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state, - ep, asoc, + ep, *asoc, event_arg, status, commands, gfp))) goto bail; @@ -1174,11 +1176,12 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, break; case SCTP_DISPOSITION_DELETE_TCB: + case SCTP_DISPOSITION_ABORT: /* This should now be a command. */ + *asoc = NULL; break; case SCTP_DISPOSITION_CONSUME: - case SCTP_DISPOSITION_ABORT: /* * We should no longer have much work to do here as the * real work has been done as explicit commands above. @@ -1266,7 +1269,6 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, asoc = cmd->obj.asoc; BUG_ON(asoc->peer.primary_path == NULL); sctp_endpoint_add_asoc(ep, asoc); - sctp_hash_established(asoc); break; case SCTP_CMD_UPDATE_ASSOC: diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 6f46aa16cb76..f1f08c8f277b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2976,7 +2976,7 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_IN_DATA_CHUNK_DISCARDS); goto discard_force; case SCTP_IERROR_NO_DATA: - goto consume; + return SCTP_DISPOSITION_ABORT; case SCTP_IERROR_PROTO_VIOLATION: return sctp_sf_abort_violation(net, ep, asoc, chunk, commands, (u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t)); @@ -3043,9 +3043,6 @@ discard_noforce: sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force); return SCTP_DISPOSITION_DISCARD; -consume: - return SCTP_DISPOSITION_CONSUME; - } /* @@ -3093,7 +3090,7 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, case SCTP_IERROR_BAD_STREAM: break; case SCTP_IERROR_NO_DATA: - goto consume; + return SCTP_DISPOSITION_ABORT; case SCTP_IERROR_PROTO_VIOLATION: return sctp_sf_abort_violation(net, ep, asoc, chunk, commands, (u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t)); @@ -3119,7 +3116,6 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); } -consume: return SCTP_DISPOSITION_CONSUME; } @@ -4825,11 +4821,9 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( * if necessary to fill gaps. */ struct sctp_chunk *abort = arg; - sctp_disposition_t retval; - retval = SCTP_DISPOSITION_CONSUME; - - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + if (abort) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); /* Even if we can't send the ABORT due to low memory delete the * TCB. This is a departure from our typical NOMEM handling. @@ -4844,7 +4838,7 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS); SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB); - return retval; + return SCTP_DISPOSITION_ABORT; } /* We tried an illegal operation on an association which is closed. */ @@ -4959,14 +4953,13 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( sctp_cmd_seq_t *commands) { struct sctp_chunk *abort = arg; - sctp_disposition_t retval; /* Stop T1-init timer */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); - retval = SCTP_DISPOSITION_CONSUME; - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + if (abort) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); @@ -4983,7 +4976,7 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, SCTP_PERR(SCTP_ERROR_USER_ABORT)); - return retval; + return SCTP_DISPOSITION_ABORT; } /* @@ -5412,7 +5405,8 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS); if (asoc->overall_error_count >= asoc->max_retrans) { - if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { + if (asoc->peer.zero_window_announced && + asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { /* * We are here likely because the receiver had its rwnd * closed for a while and we have not been able to diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 897c01c029ca..9bb80ec4c08f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -972,7 +972,7 @@ static int sctp_setsockopt_bindx(struct sock *sk, return -EFAULT; /* Alloc space for the address array in kernel memory. */ - kaddrs = kmalloc(addrs_size, GFP_KERNEL); + kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN); if (unlikely(!kaddrs)) return -ENOMEM; @@ -1228,7 +1228,6 @@ out_free: * To the hash table, try to unhash it, just in case, its a noop * if it wasn't hashed so we're safe */ - sctp_unhash_established(asoc); sctp_association_free(asoc); } return err; @@ -1301,8 +1300,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk, int addrs_size, sctp_assoc_t *assoc_id) { - int err = 0; struct sockaddr *kaddrs; + gfp_t gfp = GFP_KERNEL; + int err = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", __func__, sk, addrs, addrs_size); @@ -1315,7 +1315,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk, return -EFAULT; /* Alloc space for the address array in kernel memory. */ - kaddrs = kmalloc(addrs_size, GFP_KERNEL); + if (sk->sk_socket->file) + gfp = GFP_USER | __GFP_NOWARN; + kaddrs = kmalloc(addrs_size, gfp); if (unlikely(!kaddrs)) return -ENOMEM; @@ -1501,7 +1503,6 @@ static void sctp_close(struct sock *sk, long timeout) * ABORT or SHUTDOWN based on the linger options. */ if (sctp_state(asoc, CLOSED)) { - sctp_unhash_established(asoc); sctp_association_free(asoc); continue; } @@ -1513,8 +1514,7 @@ static void sctp_close(struct sock *sk, long timeout) struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, NULL, 0); - if (chunk) - sctp_primitive_ABORT(net, asoc, chunk); + sctp_primitive_ABORT(net, asoc, chunk); } else sctp_primitive_SHUTDOWN(net, asoc, NULL); } @@ -1952,8 +1952,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { - sctp_chunk_hold(chunk); - /* Do accounting for the write space. */ sctp_set_owner_w(chunk); @@ -1966,15 +1964,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * breaks. */ err = sctp_primitive_SEND(net, asoc, datamsg); + sctp_datamsg_put(datamsg); /* Did the lower layer accept the chunk? */ - if (err) { - sctp_datamsg_free(datamsg); + if (err) goto out_free; - } pr_debug("%s: we sent primitively\n", __func__); - sctp_datamsg_put(datamsg); err = msg_len; if (unlikely(wait_connect)) { @@ -1988,10 +1984,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_unlock; out_free: - if (new_asoc) { - sctp_unhash_established(asoc); + if (new_asoc) sctp_association_free(asoc); - } out_unlock: release_sock(sk); @@ -4928,7 +4922,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); - addrs = kmalloc(space_left, GFP_KERNEL); + addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN); if (!addrs) return -ENOMEM; @@ -5777,7 +5771,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; - ids = kmalloc(len, GFP_KERNEL); + ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; @@ -6458,7 +6452,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sctp_writeable(sk)) { mask |= POLLOUT | POLLWRNORM; } else { - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and @@ -6801,26 +6795,30 @@ no_packet: static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; - struct socket *sock = sk->sk_socket; - if ((sctp_wspace(asoc) > 0) && sock) { - if (waitqueue_active(&asoc->wait)) - wake_up_interruptible(&asoc->wait); + if (sctp_wspace(asoc) <= 0) + return; - if (sctp_writeable(sk)) { - wait_queue_head_t *wq = sk_sleep(sk); + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); - if (wq && waitqueue_active(wq)) - wake_up_interruptible(wq); + if (sctp_writeable(sk)) { + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq) { + if (waitqueue_active(&wq->wait)) + wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, - SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } + rcu_read_unlock(); } } @@ -6978,7 +6976,7 @@ void sctp_data_ready(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -7163,6 +7161,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; newsk->sk_flags = sk->sk_flags; + newsk->sk_tsflags = sk->sk_tsflags; newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; @@ -7195,6 +7194,11 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; + + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); + + security_sk_clone(sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, @@ -7375,6 +7379,13 @@ struct proto sctp_prot = { #if IS_ENABLED(CONFIG_IPV6) +#include <net/transp_v6.h> +static void sctp_v6_destroy_sock(struct sock *sk) +{ + sctp_destroy_sock(sk); + inet6_destroy_sock(sk); +} + struct proto sctpv6_prot = { .name = "SCTPv6", .owner = THIS_MODULE, @@ -7384,7 +7395,7 @@ struct proto sctpv6_prot = { .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_init_sock, - .destroy = sctp_destroy_sock, + .destroy = sctp_v6_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 26d50c565f54..daf8554fd42a 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -308,6 +308,13 @@ static struct ctl_table sctp_net_table[] = { .extra1 = &max_autoclose_min, .extra2 = &max_autoclose_max, }, + { + .procname = "pf_enable", + .data = &init_net.sctp.pf_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } }; @@ -320,7 +327,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, struct ctl_table tbl; bool changed = false; char *none = "none"; - char tmp[8]; + char tmp[8] = {0}; int ret; memset(&tbl, 0, sizeof(struct ctl_table)); diff --git a/net/socket.c b/net/socket.c index dd2c247c99e3..c044d1e8508c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -257,6 +257,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) } init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; + wq->flags = 0; RCU_INIT_POINTER(ei->socket.wq, wq); ei->socket.state = SS_UNCONNECTED; @@ -293,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; @@ -1056,27 +1057,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock or rcu_lock */ +/* This function may be called only under rcu_lock */ -int sock_wake_async(struct socket *sock, int how, int band) +int sock_wake_async(struct socket_wq *wq, int how, int band) { - struct socket_wq *wq; - - if (!sock) - return -1; - rcu_read_lock(); - wq = rcu_dereference(sock->wq); - if (!wq || !wq->fasync_list) { - rcu_read_unlock(); + if (!wq || !wq->fasync_list) return -1; - } + switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ case SOCK_WAKE_IO: @@ -1086,7 +1080,7 @@ call_kill: case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } - rcu_read_unlock(); + return 0; } EXPORT_SYMBOL(sock_wake_async); @@ -1702,6 +1696,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, msg.msg_name = addr ? (struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; + msg.msg_iocb = NULL; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); @@ -2046,6 +2041,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (err) break; ++datagrams; + cond_resched(); } fput_light(sock->file, fput_needed); @@ -2241,6 +2237,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, /* Out of band data, return right away */ if (msg_sys.msg_flags & MSG_OOB) break; + cond_resched(); } out_put: diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 59eeed43eda2..f0c6a8c78a56 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -326,6 +326,9 @@ int gssp_accept_sec_context_upcall(struct net *net, if (data->found_creds && client_name.data != NULL) { char *c; + data->creds.cr_raw_principal = kstrndup(client_name.data, + client_name.len, GFP_KERNEL); + data->creds.cr_principal = kstrndup(client_name.data, client_name.len, GFP_KERNEL); if (data->creds.cr_principal) { diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5e4f815c2b34..2b32fd602669 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (count == 0) return 0; - mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); @@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } @@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (err == -EAGAIN) goto again; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err ? err : count; } @@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, if (!cd->cache_parse) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return ret; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 23608eb0ded2..b7f21044f4d8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); + break; default: dprintk("RPC: %s: address family not supported\n", __func__); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34558..31789ef3e614 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode) int need_release; LIST_HEAD(free_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; @@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode) cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static struct inode * @@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) int first_open; int res = -ENXIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) pipe->nwriters++; res = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_pipe_msg *msg; int last_close; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; @@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) pipe->ops->destroy_msg(msg); } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct inode *inode = file_inode(filp); int res; - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) poll_wait(filp, &rpci->waitq, wait); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (rpci->pipe == NULL) mask |= POLLERR | POLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= POLLIN | POLLRDNORM; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return mask; } @@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); @@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) len += msg->len - msg->copied; } spin_unlock(&pipe->lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; @@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent, { struct inode *dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); } static int rpc_populate(struct dentry *parent, @@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent, struct dentry *dentry; int i, err; - mutex_lock(&dir->i_mutex); + inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); @@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent, if (err != 0) goto out_bad; } - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; @@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, struct inode *dir = d_inode(parent); int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, goto err_rmdir; } out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); @@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~S_IWUGO; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (err) goto out_err; out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); @@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f14f24ee9983..73ad57a59989 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(struct wait_bit_key *key) +static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index bc5b7b5032ca..cc9852897395 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1364,6 +1364,19 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); + /* Adjust the argument buffer length */ + rqstp->rq_arg.len = req->rq_private_buf.len; + if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; + rqstp->rq_arg.page_len = 0; + } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len) + rqstp->rq_arg.page_len = rqstp->rq_arg.len - + rqstp->rq_arg.head[0].iov_len; + else + rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len; + /* reset result send buffer "put" position */ resv->iov_len = 0; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a6cbb2104667..7422f28818b2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -10,11 +10,13 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <net/sock.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprt.h> #include <linux/module.h> +#include <linux/netdevice.h> #include <trace/events/sunrpc.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -938,6 +940,49 @@ static void svc_age_temp_xprts(unsigned long closure) mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } +/* Close temporary transports whose xpt_local matches server_addr immediately + * instead of waiting for them to be picked up by the timer. + * + * This is meant to be called from a notifier_block that runs when an ip + * address is deleted. + */ +void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) +{ + struct svc_xprt *xprt; + struct svc_sock *svsk; + struct socket *sock; + struct list_head *le, *next; + LIST_HEAD(to_be_closed); + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + spin_lock_bh(&serv->sv_lock); + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + if (rpc_cmp_addr(server_addr, (struct sockaddr *) + &xprt->xpt_local)) { + dprintk("svc_age_temp_xprts_now: found %p\n", xprt); + list_move(le, &to_be_closed); + } + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_closed)) { + le = to_be_closed.next; + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); + svc_close_xprt(xprt); + } +} +EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); + static void call_xpt_users(struct svc_xprt *xprt) { struct svc_xpt_user *u; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 79c0f3459b5c..69841db1f533 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -55,6 +55,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; + init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; return aops->accept(rqstp, authp); @@ -63,6 +64,7 @@ EXPORT_SYMBOL_GPL(svc_authenticate); int svc_set_client(struct svc_rqst *rqstp) { + rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 621ca7b4a155..dfacdc95b3f5 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -728,10 +728,6 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) struct kvec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if (argv->iov_len < 3*4) return SVC_GARBAGE; @@ -794,10 +790,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) u32 slen, i; int len = argv->iov_len; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if ((len -= 3*4) < 0) return SVC_GARBAGE; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e98f4a243e5..37edea6fa92d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } +EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 33f99d3004f2..dc9f3b513a05 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o physical_ops.o \ - svc_rdma.o svc_rdma_transport.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcb44f69e53..cc1251d07297 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -15,7 +15,7 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define RPCRDMA_BACKCHANNEL_DEBUG +#undef RPCRDMA_BACKCHANNEL_DEBUG static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) @@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, size_t size; req = rpcrdma_create_req(r_xprt); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->rl_backchannel = true; size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); @@ -84,9 +84,7 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - unsigned long flags; int rc = 0; while (count--) { @@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, break; } - spin_lock_irqsave(&buffers->rb_lock, flags); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + rpcrdma_recv_buffer_put(rep); } return rc; @@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) __func__); goto out_free; } + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); @@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpclen = rqst->rq_svec[0].iov_len; +#ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); pr_info("RPC: %s: RPC/RDMA: %*ph\n", __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); pr_info("RPC: %s: RPC: %*ph\n", __func__, (int)rpclen, rqst->rq_svec[0].iov_base); +#endif req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; @@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, rpcr_to_rdmar(rqst)); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: using rqst %p\n", __func__, rqst); -#endif + dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * direction reply. */ req = rpcr_to_rdmar(rqst); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: attaching rep %p to req %p\n", + dprintk("RPC: %s: attaching rep %p to req %p\n", __func__, rep, req); -#endif req->rl_reply = rep; /* Defeat the retransmit detection logic in send_request */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1e8dafbd507..c14f3a4bff68 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -179,6 +179,69 @@ out_maperr: return rc; } +static void +__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + int nsegs = seg->mr_nsegs; + + seg->rl_mw = NULL; + + while (nsegs--) + rpcrdma_unmap_one(device, seg++); + + rpcrdma_put_mw(r_xprt, mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_mw *mw; + LIST_HEAD(unmap_list); + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped MR. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + list_add(&mw->r.fmr.fmr->list, &unmap_list); + + i += seg->mr_nsegs; + } + rc = ib_unmap_fmr(&unmap_list); + if (rc) + pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __fmr_dma_unmap(r_xprt, seg); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Use the ib_unmap_fmr() verb to prevent further remote * access via RDMA READ or RDMA WRITE. */ @@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 88cf9e7269c2..e16567389e28 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -190,12 +190,11 @@ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; int depth, delta; ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); + ia->ri_device->attrs.max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", __func__, ia->ri_max_frmr_depth); @@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; + if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { + cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * @@ -245,12 +244,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } -/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs + * to be reset. + * + * WARNING: Only wr_id and status are reliable at this point + */ static void -frwr_sendcompletion(struct ib_wc *wc) +__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r) { - struct rpcrdma_mw *r; - if (likely(wc->status == IB_WC_SUCCESS)) return; @@ -261,9 +262,23 @@ frwr_sendcompletion(struct ib_wc *wc) else pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", __func__, r, ib_wc_status_msg(wc->status), wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; } +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + struct rpcrdma_frmr *f = &r->r.frmr; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + __frwr_sendcompletion_flush(wc, r); + + if (f->fr_waiter) + complete(&f->fr_linv_done); +} + static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { @@ -319,7 +334,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; - struct ib_reg_wr reg_wr; + struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n, dma_nents; u8 key; @@ -335,7 +350,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->r.frmr; frmr->fr_state = FRMR_IS_VALID; + frmr->fr_waiter = false; mr = frmr->fr_mr; + reg_wr = &frmr->fr_regwr; if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; @@ -381,19 +398,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = (uintptr_t)mw; - reg_wr.wr.num_sge = 0; - reg_wr.wr.send_flags = 0; - reg_wr.mr = mr; - reg_wr.key = mr->rkey; - reg_wr.access = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; + reg_wr->wr.next = NULL; + reg_wr->wr.opcode = IB_WR_REG_MR; + reg_wr->wr.wr_id = (uintptr_t)mw; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = 0; + reg_wr->mr = mr; + reg_wr->key = mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, ®_wr.wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -413,6 +430,116 @@ out_senderr: return rc; } +static struct ib_send_wr * +__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + struct ib_send_wr *invalidate_wr; + + f->fr_waiter = false; + f->fr_state = FRMR_IS_INVALID; + invalidate_wr = &f->fr_invwr; + + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (unsigned long)(void *)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; + + return invalidate_wr; +} + +static void +__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int rc) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + + seg->rl_mw = NULL; + + ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); + + if (!rc) + rpcrdma_put_mw(r_xprt, mw); + else + __frwr_queue_recovery(mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_frmr *f; + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + invalidate_wrs = pos = prev = NULL; + seg = NULL; + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + pos = __frwr_prepare_linv_wr(seg); + + if (!invalidate_wrs) + invalidate_wrs = pos; + else + prev->next = pos; + prev = pos; + + i += seg->mr_nsegs; + } + f = &seg->rl_mw->r.frmr; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + f->fr_invwr.send_flags = IB_SEND_SIGNALED; + f->fr_waiter = true; + init_completion(&f->fr_linv_done); + INIT_CQCOUNT(&r_xprt->rx_ep); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. + */ + rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); + if (rc) + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + + wait_for_completion(&f->fr_linv_done); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __frwr_dma_unmap(r_xprt, seg, rc); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Post a LOCAL_INV Work Request to prevent further remote access * via RDMA READ or RDMA WRITE. */ @@ -423,23 +550,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_send_wr invalidate_wr, *bad_wr; + struct ib_send_wr *invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; dprintk("RPC: %s: FRMR %p\n", __func__, mw); seg1->rl_mw = NULL; frmr->fr_state = FRMR_IS_INVALID; + invalidate_wr = &mw->r.frmr.fr_invwr; - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (uintptr_t)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) goto out_err; @@ -471,6 +599,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 617b76f22154..dbb302ecf590 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) return 1; } +/* DMA unmap all memory regions that were mapped for "req". + */ +static void +physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + unsigned int i; + + for (i = 0; req->rl_nchunks; --req->rl_nchunks) + rpcrdma_unmap_one(device, &req->rl_segments[i++]); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, + .ro_unmap_sync = physical_op_unmap_sync, .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c10d9699441c..0f28f2d743ed 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (req->rl_reply) goto out_duplicate; + /* Sanity checking has passed. We are now committed + * to complete this transaction. + */ + list_del_init(&rqst->rq_list); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", __func__, rep, req, rqst, @@ -888,12 +893,23 @@ badheader: break; } + /* Invalidate and flush the data payloads before waking the + * waiting application. This guarantees the memory region is + * properly fenced from the server before the application + * accesses the data. It also ensures proper send flow + * control: waking the next RPC waits until this RPC has + * relinquished all its Send Queue entries. + */ + if (req->rl_nchunks) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); + credits = be32_to_cpu(headerp->rm_credit); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_max_requests) credits = r_xprt->rx_buf.rb_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1b7051bdbdc8..c846ca9f1eba 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD; static unsigned int min_ord = 1; static unsigned int max_ord = 4096; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; @@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map and context caches */ -struct kmem_cache *svc_rdma_map_cachep; -struct kmem_cache *svc_rdma_ctxt_cachep; - struct workqueue_struct *svc_rdma_wq; /* @@ -243,17 +240,16 @@ void svc_rdma_cleanup(void) svc_unreg_xprt_class(&svc_rdma_bc_class); #endif svc_unreg_xprt_class(&svc_rdma_class); - kmem_cache_destroy(svc_rdma_map_cachep); - kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) { dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %d\n", svcrdma_max_requests); - dprintk("\tsq_depth : %d\n", + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tsq_depth : %u\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); @@ -264,39 +260,10 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); - /* Create the temporary map cache */ - svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", - sizeof(struct svc_rdma_req_map), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_map_cachep) { - printk(KERN_INFO "Could not allocate map cache.\n"); - goto err0; - } - - /* Create the temporary context cache */ - svc_rdma_ctxt_cachep = - kmem_cache_create("svc_rdma_ctxt_cache", - sizeof(struct svc_rdma_op_ctxt), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_ctxt_cachep) { - printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); - goto err1; - } - /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); #if defined(CONFIG_SUNRPC_BACKCHANNEL) svc_reg_xprt_class(&svc_rdma_bc_class); #endif return 0; - err1: - kmem_cache_destroy(svc_rdma_map_cachep); - err0: - unregister_sysctl_table(svcrdma_table_header); - destroy_workqueue(svc_rdma_wq); - return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 000000000000..65a7c232a345 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = rmsgp->rm_xid; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock_bh(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock_bh(&xprt->transport_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. + * + * This is similar to svc_rdma_reply, but takes an rpc_rqst + * instead, does not support chunks, and avoids blocking memory + * allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. + */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct xdr_buf *sndbuf = &rqst->rq_snd_buf; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + struct ib_send_wr send_wr; + int ret; + + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + if (ret) + goto out_err; + + /* Post a recv buffer to handle the reply for this request. */ + ret = svc_rdma_post_recv(rdma, GFP_NOIO); + if (ret) { + pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + ret = -ENOTCONN; + goto out_err; + } + + ctxt = svc_rdma_get_context(rdma); + ctxt->pages[0] = virt_to_page(rqst->rq_buffer); + ctxt->count = 1; + + ctxt->wr_op = IB_WR_SEND; + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; + ctxt->sge[0].length = sndbuf->len; + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, + sndbuf->len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { + ret = -EIO; + goto out_unmap; + } + atomic_inc(&rdma->sc_dma_used); + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) { + ret = -EIO; + goto out_unmap; + } + +out_err: + svc_rdma_put_req_map(rdma, vec); + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + * + * Returns NULL if there was a temporary allocation failure. + */ +static void * +xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + struct page *page; + + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + + /* Prevent an infinite loop: try to make this case work */ + if (size > PAGE_SIZE) + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return NULL; + + return page_address(page); +} + +static void +xprt_rdma_bc_free(void *buffer) +{ + /* No-op: ctxt and page have already been freed. */ +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. + */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. + */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. + */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ff4f01e527ec..c8b8a8b4181e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; head->arg.page_len += len; + head->arg.len += len; if (!pg_off) head->count++; @@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, goto err; atomic_inc(&xprt->sc_dma_used); - /* The lkey here is either a local dma lkey or a dma_mr lkey */ - ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; ctxt->count++; @@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp, return ret; } +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +{ + __be32 *p = (__be32 *)rmsgp; + + if (!xprt->xpt_bc_xprt) + return false; + + if (rmsgp->rm_type != rdma_msg) + return false; + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != rmsgp->rm_xid) + return false; + /* call direction */ + if (p[8] == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + /* * Set up the rqstp thread context to point to the RQ buffer. If * necessary, pull additional data from the client with an RDMA_READ @@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } + if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + if (ret) + goto repost; + return ret; + } + /* Read read-list data. */ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { @@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); defer: return 0; + +repost: + ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma_xprt); + set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab75fc3..df57f3ce6cd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, if (xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + pr_err("svcrdma: %s: XDR buffer length error\n", __func__); return -EIO; } @@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + dprintk("svcrdma: %s: sge_no %d page_no %d " "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, + __func__, sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); vec->count = sge_no; @@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].addr)) goto err; atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; + sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; sge_no++; @@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int ret; /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma); + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) { printk(KERN_INFO "svcrdma: could not post a receive buffer, err=%d." @@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, @@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr)) goto err; atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } if (byte_count != 0) { @@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - vec = svc_rdma_get_req_map(); - ret = map_xdr(rdma, &rqstp->rq_res, vec); + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + ret = -ENOMEM; + res_page = alloc_page(GFP_KERNEL); + if (!res_page) + goto err0; rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) @@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; err1: put_page(res_page); err0: - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b348b4adef29..5763825d09bf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, + gfp_t flags) { struct svc_rdma_op_ctxt *ctxt; - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, - GFP_KERNEL | __GFP_NOFAIL); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); + ctxt = kmalloc(sizeof(*ctxt), flags); + if (ctxt) { + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->free); + INIT_LIST_HEAD(&ctxt->dto_q); + } + return ctxt; +} + +static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* Each RPC/RDMA credit can consume a number of send + * and receive WQEs. One ctxt is allocated for each. + */ + i = xprt->sc_sq_depth + xprt->sc_rq_depth; + + while (i--) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = alloc_ctxt(xprt, GFP_KERNEL); + if (!ctxt) { + dprintk("svcrdma: No memory for RDMA ctxt\n"); + return false; + } + list_add(&ctxt->free, &xprt->sc_ctxts); + } + return true; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used++; + if (list_empty(&xprt->sc_ctxts)) + goto out_empty; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del_init(&ctxt->free); + spin_unlock_bh(&xprt->sc_ctxt_lock); + +out: ctxt->count = 0; ctxt->frmr = NULL; - atomic_inc(&xprt->sc_ctxt_used); return ctxt; + +out_empty: + /* Either pre-allocation missed the mark, or send + * queue accounting is broken. + */ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = alloc_ctxt(xprt, GFP_NOIO); + if (ctxt) + goto out; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + spin_unlock_bh(&xprt->sc_ctxt_lock); + WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); + return NULL; } void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) @@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches - * the sc_dma_lkey, otherwise, ignore it since it is + * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, @@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) { - struct svcxprt_rdma *xprt; + struct svcxprt_rdma *xprt = ctxt->xprt; int i; - xprt = ctxt->xprt; if (free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); - atomic_dec(&xprt->sc_ctxt_used); + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + list_add(&ctxt->free, &xprt->sc_ctxts); + spin_unlock_bh(&xprt->sc_ctxt_lock); } -/* - * Temporary NFS req mappings are shared across all transport - * instances. These are short lived and should be bounded by the number - * of concurrent server threads * depth of the SQ. - */ -struct svc_rdma_req_map *svc_rdma_get_req_map(void) +static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_ctxts)) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del(&ctxt->free); + kfree(ctxt); + } +} + +static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) { struct svc_rdma_req_map *map; - map = kmem_cache_alloc(svc_rdma_map_cachep, - GFP_KERNEL | __GFP_NOFAIL); + + map = kmalloc(sizeof(*map), flags); + if (map) + INIT_LIST_HEAD(&map->free); + return map; +} + +static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* One for each receive buffer on this connection. */ + i = xprt->sc_max_requests; + + while (i--) { + struct svc_rdma_req_map *map; + + map = alloc_req_map(GFP_KERNEL); + if (!map) { + dprintk("svcrdma: No memory for request map\n"); + return false; + } + list_add(&map->free, &xprt->sc_maps); + } + return true; +} + +struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_req_map *map = NULL; + + spin_lock(&xprt->sc_map_lock); + if (list_empty(&xprt->sc_maps)) + goto out_empty; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del_init(&map->free); + spin_unlock(&xprt->sc_map_lock); + +out: map->count = 0; return map; + +out_empty: + spin_unlock(&xprt->sc_map_lock); + + /* Pre-allocation amount was incorrect */ + map = alloc_req_map(GFP_NOIO); + if (map) + goto out; + + WARN_ONCE(1, "svcrdma: empty request map list?\n"); + return NULL; +} + +void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, + struct svc_rdma_req_map *map) +{ + spin_lock(&xprt->sc_map_lock); + list_add(&map->free, &xprt->sc_maps); + spin_unlock(&xprt->sc_map_lock); } -void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) { - kmem_cache_free(svc_rdma_map_cachep, map); + while (!list_empty(&xprt->sc_maps)) { + struct svc_rdma_req_map *map; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del(&map->free); + kfree(map); + } } /* ib_cq event handler */ @@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) static void process_context(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *ctxt) { + struct svc_rdma_op_ctxt *read_hdr; + int free_pages = 0; + svc_rdma_unmap_dma(ctxt); switch (ctxt->wr_op) { case IB_WR_SEND: - if (ctxt->frmr) - pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 1); + free_pages = 1; break; case IB_WR_RDMA_WRITE: - if (ctxt->frmr) - pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: svc_rdma_put_frmr(xprt, ctxt->frmr); - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - if (read_hdr) { - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - } else { - pr_err("svcrdma: ctxt->read_hdr == NULL\n"); - } - svc_xprt_enqueue(&xprt->sc_xprt); - } + + if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) + break; + + read_hdr = ctxt->read_hdr; svc_rdma_put_context(ctxt, 0); - break; + + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + return; default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d\n", - ctxt->wr_op); + dprintk("svcrdma: unexpected completion opcode=%d\n", + ctxt->wr_op); break; } + + svc_rdma_put_context(ctxt, free_pages); } /* @@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); - - cma_xprt->sc_ord = svcrdma_ord; - - cma_xprt->sc_max_req_size = svcrdma_max_req_size; - cma_xprt->sc_max_requests = svcrdma_max_requests; - cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; - atomic_set(&cma_xprt->sc_sq_count, 0); - atomic_set(&cma_xprt->sc_ctxt_used, 0); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_map_lock); if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) { struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; @@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + page = alloc_page(flags); + if (!page) + goto err_put_ctxt; ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } @@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; - int uninitialized_var(dma_mr_acc); - int need_dma_mr = 0; - int ret; - int i; + struct ib_device *dev; + unsigned int i; + int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", newxprt, newxprt->sc_cm_id); - ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); - if (ret) { - dprintk("svcrdma: could not query device attributes on " - "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); - goto errout; - } + dev = newxprt->sc_cm_id->device; /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)devattr.max_sge, + newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, RPCSVC_MAXPAGES); - newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, - (size_t)svcrdma_max_requests); - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_requests); + newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_bc_requests); + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + + if (!svc_rdma_prealloc_ctxts(newxprt)) + goto errout; + if (!svc_rdma_prealloc_maps(newxprt)) + goto errout; /* * Limit ORD based on client limit, local device limit, and * configured svcrdma limit. */ - newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + newxprt->sc_pd = ib_alloc_pd(dev); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } cq_attr.cqe = newxprt->sc_sq_depth; - newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + newxprt->sc_sq_cq = ib_create_cq(dev, sq_comp_handler, cq_event_handler, newxprt, @@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_max_requests; - newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + cq_attr.cqe = newxprt->sc_rq_depth; + newxprt->sc_rq_cq = ib_create_cq(dev, rq_comp_handler, cq_event_handler, newxprt, @@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; - qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " cap.max_send_sge = %d\n" " cap.max_recv_sge = %d\n", newxprt->sc_cm_id, newxprt->sc_pd, - newxprt->sc_cm_id->device, newxprt->sc_pd->device, + dev, newxprt->sc_pd->device, qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr, qp_attr.cap.max_send_sge, @@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * of an RDMA_READ. IB does not. */ newxprt->sc_reader = rdma_read_chunk_lcl; - if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = - devattr.max_fast_reg_page_list_len; + dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; } @@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) goto errout; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || - !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) - dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; - } - - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ - if (need_dma_mr) { - /* Register all of physical memory */ - newxprt->sc_phys_mr = - ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", - ret); - goto errout; - } - newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; - } else - newxprt->sc_dma_lkey = - newxprt->sc_cm_id->device->local_dma_lkey; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); + for (i = 0; i < newxprt->sc_rq_depth; i++) { + ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); goto errout; @@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); /* We should only be called from kref_put */ - if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); + atomic_read(&xprt->xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work) } /* Warn if we leaked a resource or under-referenced */ - if (atomic_read(&rdma->sc_ctxt_used) != 0) + if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", - atomic_read(&rdma->sc_ctxt_used)); + rdma->sc_ctxt_used); if (atomic_read(&rdma->sc_dma_used) != 0) pr_err("svcrdma: dma still in use? (%d)\n", atomic_read(&rdma->sc_dma_used)); - /* De-allocate fastreg mr */ + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_ctxts(rdma); + svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) ib_destroy_cq(rdma->sc_rq_cq); - if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) - ib_dereg_mr(rdma->sc_phys_mr); - if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int length; int ret; - p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + p = alloc_page(GFP_KERNEL); + if (!p) + return; va = page_address(p); /* XDR encode error */ @@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, return; } atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[0].length = length; /* Prepare SEND WR */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7d7525..b1b009f10ea3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -63,7 +63,7 @@ */ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; -static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; @@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = { #endif -#define RPCRDMA_BIND_TO (60U * HZ) -#define RPCRDMA_INIT_REEST_TO (5U * HZ) -#define RPCRDMA_MAX_REEST_TO (30U * HZ) -#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) - -static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; } -static void +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { char buf[128]; @@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } -static void +void xprt_rdma_free_addresses(struct rpc_xprt *xprt) { unsigned int i; @@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; - flags = GFP_NOIO | __GFP_NOWARN; + flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -576,6 +571,9 @@ xprt_rdma_free(void *buffer) rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); req = rb->rg_owner; + if (req->rl_backchannel) + return; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); @@ -639,7 +637,7 @@ drop_connection: return -ENOTCONN; /* implies disconnect */ } -static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); long idle_time = 0; @@ -740,6 +738,11 @@ void xprt_rdma_cleanup(void) rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", + __func__, rc); } int xprt_rdma_init(void) @@ -763,6 +766,14 @@ int xprt_rdma_init(void) return rc; } + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index eadd1655145a..878f1bfb1db9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -462,7 +462,6 @@ int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { struct rpcrdma_ia *ia = &xprt->rx_ia; - struct ib_device_attr *devattr = &ia->ri_devattr; int rc; ia->ri_dma_mr = NULL; @@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_device, devattr); - if (rc) { - dprintk("RPC: %s: ib_query_device failed %d\n", - __func__, rc); - goto out3; - } - if (memreg == RPCRDMA_FRMR) { - if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || - (devattr->max_fast_reg_page_list_len == 0)) { + if (!(ia->ri_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) || + (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; @@ -566,24 +559,23 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; struct ib_cq_init_attr cq_attr = {}; unsigned int max_qp_wr; int rc, err; - if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; } - if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", __func__); return -ENOMEM; } - max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -616,10 +608,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS) - ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS; - else if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; /* always signal? */ INIT_CQCOUNT(ep); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); @@ -670,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = - devattr->max_qp_rd_atom; + ia->ri_device->attrs.max_qp_rd_atom; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; @@ -852,10 +842,11 @@ retry: if (extras) { rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) + if (rc) { pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", __func__, rc); rc = 0; + } } } @@ -1337,15 +1328,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; - unsigned long flags; int rc; while (count--) { - spin_lock_irqsave(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_recv_bufs)) goto out_reqbuf; rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) @@ -1355,7 +1345,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) return 0; out_reqbuf: - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); pr_warn("%s: no extra receive buffers\n", __func__); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ac7f8d4f632a..38fe11b09875 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -55,6 +55,11 @@ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + /* * Interface Adapter -- one per transport instance */ @@ -68,7 +73,6 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; - struct ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -88,12 +92,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -/* - * Force a signaled SEND Work Request every so often, - * in case the provider needs to do some housekeeping. - */ -#define RPCRDMA_MAX_UNSIGNALED_SENDS (32) - #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) @@ -148,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -207,6 +207,12 @@ struct rpcrdma_frmr { enum rpcrdma_frmr_state fr_state; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; + bool fr_waiter; + struct completion fr_linv_done;; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; }; struct rpcrdma_fmr { @@ -309,6 +315,8 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ struct list_head rb_allreqs; + + u32 rb_bc_max_requests; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -364,6 +372,8 @@ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct rpcrdma_req *); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); int (*ro_open)(struct rpcrdma_ia *, @@ -514,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *); /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -529,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* Temporary NFS request map cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_map_cachep; -/* WR context cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_ctxt_cachep; -/* Workqueue created in svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; +extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1d1a70498910..fde2138b81e7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) { - /* - * Notify TCP that we're limited by the application - * window size - */ - set_bit(SOCK_NOSPACE, &transport->sock->flags); - sk->sk_write_pending++; - /* ...and wait for more buffer space */ - xprt_wait_for_buffer_space(task, xs_nospace_callback); - } - } else { - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + /* wait for more buffer space */ + sk->sk_write_pending++; + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } else ret = -ENOTCONN; - } spin_unlock_bh(&xprt->transport_lock); @@ -616,9 +605,6 @@ process_status: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -ENOBUFS: case -EPIPE: @@ -626,7 +612,10 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket *sock; + struct socket_wq *wq; struct rpc_xprt *xprt; - if (unlikely(!(sock = sk->sk_socket))) + if (!sk->sk_socket) return; - clear_bit(SOCK_NOSPACE, &sock->flags); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0) - return; + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) + goto out; xprt_write_space(xprt); +out: + rcu_read_unlock(); } /** @@ -1907,18 +1900,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } } #else -static inline void xs_reclassify_socketu(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket4(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket6(struct socket *sock) -{ -} - static inline void xs_reclassify_socket(int family, struct socket *sock) { } @@ -2008,7 +1989,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) "transport socket (%d).\n", -status); goto out; } - xs_reclassify_socketu(sock); + xs_reclassify_socket(AF_LOCAL, sock); dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index f34e535e93bd..ebc661d3b6e3 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -345,6 +345,8 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) return sizeof(struct switchdev_obj_ipv4_fib); case SWITCHDEV_OBJ_ID_PORT_FDB: return sizeof(struct switchdev_obj_port_fdb); + case SWITCHDEV_OBJ_ID_PORT_MDB: + return sizeof(struct switchdev_obj_port_mdb); default: BUG(); } @@ -723,6 +725,7 @@ static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, u32 filter_mask) { struct switchdev_vlan_dump dump = { + .vlan.obj.orig_dev = dev, .vlan.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .skb = skb, .filter_mask = filter_mask, @@ -757,6 +760,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, int nlflags) { struct switchdev_attr attr = { + .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, }; u16 mode = BRIDGE_MODE_UNDEF; @@ -778,6 +782,7 @@ static int switchdev_port_br_setflag(struct net_device *dev, unsigned long brport_flag) { struct switchdev_attr attr = { + .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, }; u8 flag = nla_get_u8(nlattr); @@ -853,6 +858,7 @@ static int switchdev_port_br_afspec(struct net_device *dev, struct nlattr *attr; struct bridge_vlan_info *vinfo; struct switchdev_obj_port_vlan vlan = { + .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, }; int rem; @@ -975,6 +981,7 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], u16 vid, u16 nlm_flags) { struct switchdev_obj_port_fdb fdb = { + .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, .vid = vid, }; @@ -1000,6 +1007,7 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], u16 vid) { struct switchdev_obj_port_fdb fdb = { + .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, .vid = vid, }; @@ -1077,6 +1085,7 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *filter_dev, int idx) { struct switchdev_fdb_dump dump = { + .fdb.obj.orig_dev = dev, .fdb.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, .dev = dev, .skb = skb, @@ -1135,6 +1144,7 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (!dev) return NULL; + attr.orig_dev = dev; if (switchdev_port_attr_get(dev, &attr)) return NULL; @@ -1194,6 +1204,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (!dev) return 0; + ipv4_fib.obj.orig_dev = dev; err = switchdev_port_obj_add(dev, &ipv4_fib.obj); if (!err) fi->fib_flags |= RTNH_F_OFFLOAD; @@ -1238,6 +1249,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, if (!dev) return 0; + ipv4_fib.obj.orig_dev = dev; err = switchdev_port_obj_del(dev, &ipv4_fib.obj); if (!err) fi->fib_flags &= ~RTNH_F_OFFLOAD; @@ -1270,10 +1282,12 @@ static bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct switchdev_attr a_attr = { + .orig_dev = a, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; struct switchdev_attr b_attr = { + .orig_dev = b, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, .flags = SWITCHDEV_F_NO_RECURSE, }; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 9dc239dfe192..e401108360a2 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -332,131 +332,15 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_sk_rcv(net, inputq); } -static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, - struct tipc_stats *stats) -{ - int i; - struct nlattr *nest; - - struct nla_map { - __u32 key; - __u32 val; - }; - - struct nla_map map[] = { - {TIPC_NLA_STATS_RX_INFO, stats->recv_info}, - {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments}, - {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented}, - {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles}, - {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled}, - {TIPC_NLA_STATS_TX_INFO, stats->sent_info}, - {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments}, - {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented}, - {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles}, - {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled}, - {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks}, - {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv}, - {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks}, - {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks}, - {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted}, - {TIPC_NLA_STATS_DUPLICATES, stats->duplicates}, - {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs}, - {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz}, - {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ? - (stats->accu_queue_sz / stats->queue_sz_counts) : 0} - }; - - nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS); - if (!nest) - return -EMSGSIZE; - - for (i = 0; i < ARRAY_SIZE(map); i++) - if (nla_put_u32(skb, map[i].key, map[i].val)) - goto msg_full; - - nla_nest_end(skb, nest); - - return 0; -msg_full: - nla_nest_cancel(skb, nest); - - return -EMSGSIZE; -} - -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) -{ - int err; - void *hdr; - struct nlattr *attrs; - struct nlattr *prop; - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *bcl = tn->bcl; - - if (!bcl) - return 0; - - tipc_bcast_lock(net); - - hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, - NLM_F_MULTI, TIPC_NL_LINK_GET); - if (!hdr) - return -EMSGSIZE; - - attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); - if (!attrs) - goto msg_full; - - /* The broadcast link is always up */ - if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) - goto attr_msg_full; - - if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST)) - goto attr_msg_full; - if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) - goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) - goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) - goto attr_msg_full; - - prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); - if (!prop) - goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) - goto prop_msg_full; - nla_nest_end(msg->skb, prop); - - err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats); - if (err) - goto attr_msg_full; - - tipc_bcast_unlock(net); - nla_nest_end(msg->skb, attrs); - genlmsg_end(msg->skb, hdr); - - return 0; - -prop_msg_full: - nla_nest_cancel(msg->skb, prop); -attr_msg_full: - nla_nest_cancel(msg->skb, attrs); -msg_full: - tipc_bcast_unlock(net); - genlmsg_cancel(msg->skb, hdr); - - return -EMSGSIZE; -} - int tipc_bclink_reset_stats(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *bcl = tn->bcl; + struct tipc_link *l = tipc_bc_sndlink(net); - if (!bcl) + if (!l) return -ENOPROTOOPT; tipc_bcast_lock(net); - memset(&bcl->stats, 0, sizeof(bcl->stats)); + tipc_link_reset_stats(l); tipc_bcast_unlock(net); return 0; } @@ -530,9 +414,7 @@ enomem: void tipc_bcast_reinit(struct net *net) { - struct tipc_bc_base *b = tipc_bc_base(net); - - msg_set_prevnode(b->link->pmsg, tipc_own_addr(net)); + tipc_link_reinit(tipc_bc_sndlink(net), tipc_own_addr(net)); } void tipc_bcast_stop(struct net *net) diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 2855b9356a15..1944c6c00bb9 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -43,6 +43,7 @@ struct tipc_node; struct tipc_msg; struct tipc_nl_msg; struct tipc_node_map; +extern const char tipc_bclink_name[]; int tipc_bcast_init(struct net *net); void tipc_bcast_reinit(struct net *net); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 648f2a67f314..802ffad3200d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -71,7 +71,7 @@ static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } }; -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr); +static void bearer_disable(struct net *net, struct tipc_bearer *b); /** * tipc_media_find - locates specified media object by name @@ -107,13 +107,13 @@ static struct tipc_media *media_find_id(u8 type) void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a) { char addr_str[MAX_ADDR_STR]; - struct tipc_media *m_ptr; + struct tipc_media *m; int ret; - m_ptr = media_find_id(a->media_id); + m = media_find_id(a->media_id); - if (m_ptr && !m_ptr->addr2str(a, addr_str, sizeof(addr_str))) - ret = scnprintf(buf, len, "%s(%s)", m_ptr->name, addr_str); + if (m && !m->addr2str(a, addr_str, sizeof(addr_str))) + ret = scnprintf(buf, len, "%s(%s)", m->name, addr_str); else { u32 i; @@ -175,13 +175,13 @@ static int bearer_name_validate(const char *name, struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; u32 i; for (i = 0; i < MAX_BEARERS; i++) { - b_ptr = rtnl_dereference(tn->bearer_list[i]); - if (b_ptr && (!strcmp(b_ptr->name, name))) - return b_ptr; + b = rtnl_dereference(tn->bearer_list[i]); + if (b && (!strcmp(b->name, name))) + return b; } return NULL; } @@ -189,24 +189,24 @@ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name) void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (b_ptr) - tipc_disc_add_dest(b_ptr->link_req); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (b) + tipc_disc_add_dest(b->link_req); rcu_read_unlock(); } void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (b_ptr) - tipc_disc_remove_dest(b_ptr->link_req); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (b) + tipc_disc_remove_dest(b->link_req); rcu_read_unlock(); } @@ -218,8 +218,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, struct nlattr *attr[]) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bearer *b_ptr; - struct tipc_media *m_ptr; + struct tipc_bearer *b; + struct tipc_media *m; struct tipc_bearer_names b_names; char addr_string[16]; u32 bearer_id; @@ -255,31 +255,31 @@ static int tipc_enable_bearer(struct net *net, const char *name, return -EINVAL; } - m_ptr = tipc_media_find(b_names.media_name); - if (!m_ptr) { + m = tipc_media_find(b_names.media_name); + if (!m) { pr_warn("Bearer <%s> rejected, media <%s> not registered\n", name, b_names.media_name); return -EINVAL; } if (priority == TIPC_MEDIA_LINK_PRI) - priority = m_ptr->priority; + priority = m->priority; restart: bearer_id = MAX_BEARERS; with_this_prio = 1; for (i = MAX_BEARERS; i-- != 0; ) { - b_ptr = rtnl_dereference(tn->bearer_list[i]); - if (!b_ptr) { + b = rtnl_dereference(tn->bearer_list[i]); + if (!b) { bearer_id = i; continue; } - if (!strcmp(name, b_ptr->name)) { + if (!strcmp(name, b->name)) { pr_warn("Bearer <%s> rejected, already enabled\n", name); return -EINVAL; } - if ((b_ptr->priority == priority) && + if ((b->priority == priority) && (++with_this_prio > 2)) { if (priority-- == 0) { pr_warn("Bearer <%s> rejected, duplicate priority\n", @@ -297,35 +297,35 @@ restart: return -EINVAL; } - b_ptr = kzalloc(sizeof(*b_ptr), GFP_ATOMIC); - if (!b_ptr) + b = kzalloc(sizeof(*b), GFP_ATOMIC); + if (!b) return -ENOMEM; - strcpy(b_ptr->name, name); - b_ptr->media = m_ptr; - res = m_ptr->enable_media(net, b_ptr, attr); + strcpy(b->name, name); + b->media = m; + res = m->enable_media(net, b, attr); if (res) { pr_warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res); return -EINVAL; } - b_ptr->identity = bearer_id; - b_ptr->tolerance = m_ptr->tolerance; - b_ptr->window = m_ptr->window; - b_ptr->domain = disc_domain; - b_ptr->net_plane = bearer_id + 'A'; - b_ptr->priority = priority; + b->identity = bearer_id; + b->tolerance = m->tolerance; + b->window = m->window; + b->domain = disc_domain; + b->net_plane = bearer_id + 'A'; + b->priority = priority; - res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr); + res = tipc_disc_create(net, b, &b->bcast_addr); if (res) { - bearer_disable(net, b_ptr); + bearer_disable(net, b); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", name); return -EINVAL; } - rcu_assign_pointer(tn->bearer_list[bearer_id], b_ptr); + rcu_assign_pointer(tn->bearer_list[bearer_id], b); pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, @@ -336,11 +336,11 @@ restart: /** * tipc_reset_bearer - Reset all links established over this bearer */ -static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) +static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) { - pr_info("Resetting bearer <%s>\n", b_ptr->name); - tipc_node_delete_links(net, b_ptr->identity); - tipc_disc_reset(net, b_ptr); + pr_info("Resetting bearer <%s>\n", b->name); + tipc_node_delete_links(net, b->identity); + tipc_disc_reset(net, b); return 0; } @@ -349,26 +349,26 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) * * Note: This routine assumes caller holds RTNL lock. */ -static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr) +static void bearer_disable(struct net *net, struct tipc_bearer *b) { struct tipc_net *tn = net_generic(net, tipc_net_id); u32 i; - pr_info("Disabling bearer <%s>\n", b_ptr->name); - b_ptr->media->disable_media(b_ptr); + pr_info("Disabling bearer <%s>\n", b->name); + b->media->disable_media(b); - tipc_node_delete_links(net, b_ptr->identity); - RCU_INIT_POINTER(b_ptr->media_ptr, NULL); - if (b_ptr->link_req) - tipc_disc_delete(b_ptr->link_req); + tipc_node_delete_links(net, b->identity); + RCU_INIT_POINTER(b->media_ptr, NULL); + if (b->link_req) + tipc_disc_delete(b->link_req); for (i = 0; i < MAX_BEARERS; i++) { - if (b_ptr == rtnl_dereference(tn->bearer_list[i])) { + if (b == rtnl_dereference(tn->bearer_list[i])) { RCU_INIT_POINTER(tn->bearer_list[i], NULL); break; } } - kfree_rcu(b_ptr, rcu); + kfree_rcu(b, rcu); } int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, @@ -411,7 +411,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b) /** * tipc_l2_send_msg - send a TIPC packet out over an L2 interface * @buf: the packet to be sent - * @b_ptr: the bearer through which the packet is to be sent + * @b: the bearer through which the packet is to be sent * @dest: peer destination address */ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, @@ -532,14 +532,14 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(dev->tipc_ptr); - if (likely(b_ptr)) { + b = rcu_dereference_rtnl(dev->tipc_ptr); + if (likely(b)) { if (likely(buf->pkt_type <= PACKET_BROADCAST)) { buf->next = NULL; - tipc_rcv(dev_net(dev), buf, b_ptr); + tipc_rcv(dev_net(dev), buf, b); rcu_read_unlock(); return NET_RX_SUCCESS; } @@ -564,13 +564,13 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; - b_ptr = rtnl_dereference(dev->tipc_ptr); - if (!b_ptr) + b = rtnl_dereference(dev->tipc_ptr); + if (!b) return NOTIFY_DONE; - b_ptr->mtu = dev->mtu; + b->mtu = dev->mtu; switch (evt) { case NETDEV_CHANGE: @@ -578,16 +578,16 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_GOING_DOWN: case NETDEV_CHANGEMTU: - tipc_reset_bearer(net, b_ptr); + tipc_reset_bearer(net, b); break; case NETDEV_CHANGEADDR: - b_ptr->media->raw2addr(b_ptr, &b_ptr->addr, + b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); - tipc_reset_bearer(net, b_ptr); + tipc_reset_bearer(net, b); break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b_ptr); + bearer_disable(dev_net(dev), b); break; } return NOTIFY_OK; @@ -623,13 +623,13 @@ void tipc_bearer_cleanup(void) void tipc_bearer_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_bearer *b_ptr; + struct tipc_bearer *b; u32 i; for (i = 0; i < MAX_BEARERS; i++) { - b_ptr = rtnl_dereference(tn->bearer_list[i]); - if (b_ptr) { - bearer_disable(net, b_ptr); + b = rtnl_dereference(tn->bearer_list[i]); + if (b) { + bearer_disable(net, b); tn->bearer_list[i] = NULL; } } diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 552185bc4773..e31820516774 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -103,11 +103,11 @@ struct tipc_bearer; */ struct tipc_media { int (*send_msg)(struct net *net, struct sk_buff *buf, - struct tipc_bearer *b_ptr, + struct tipc_bearer *b, struct tipc_media_addr *dest); - int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr, + int (*enable_media)(struct net *net, struct tipc_bearer *b, struct nlattr *attr[]); - void (*disable_media)(struct tipc_bearer *b_ptr); + void (*disable_media)(struct tipc_bearer *b); int (*addr2str)(struct tipc_media_addr *addr, char *strbuf, int bufsz); @@ -176,7 +176,7 @@ struct tipc_bearer_names { * TIPC routines available to supported media types */ -void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr); +void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b); /* * Routines made available to TIPC by supported media types diff --git a/net/tipc/core.h b/net/tipc/core.h index 18e95a8020cd..5504d63503df 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -118,6 +118,11 @@ static inline int tipc_netid(struct net *net) return tipc_net(net)->net_id; } +static inline struct list_head *tipc_nodes(struct net *net) +{ + return &tipc_net(net)->node_list; +} + static inline u16 mod(u16 x) { return x & 0xffffu; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index afe8c47c4085..f1e738e80535 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -75,14 +75,14 @@ struct tipc_link_req { * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace * @type: message type (request or response) - * @b_ptr: ptr to bearer issuing message + * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, - struct tipc_bearer *b_ptr) + struct tipc_bearer *b) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_msg *msg; - u32 dest_domain = b_ptr->domain; + u32 dest_domain = b->domain; msg = buf_msg(buf); tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type, @@ -92,16 +92,16 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(msg, dest_domain); msg_set_bc_netid(msg, tn->net_id); - b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr); + b->media->addr2msg(msg_media_addr(msg), &b->addr); } /** * disc_dupl_alert - issue node address duplication alert - * @b_ptr: pointer to bearer detecting duplication + * @b: pointer to bearer detecting duplication * @node_addr: duplicated node address * @media_addr: media address advertised by duplicated node */ -static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr, +static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { char node_addr_str[16]; @@ -111,7 +111,7 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr, tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str, - media_addr_str, b_ptr->name); + media_addr_str, b->name); } /** @@ -261,13 +261,13 @@ exit: /** * tipc_disc_create - create object to send periodic link setup requests * @net: the applicable net namespace - * @b_ptr: ptr to bearer issuing requests + * @b: ptr to bearer issuing requests * @dest: destination address for request messages * @dest_domain: network domain to which links can be established * * Returns 0 if successful, otherwise -errno. */ -int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, +int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest) { struct tipc_link_req *req; @@ -282,17 +282,17 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, return -ENOMEM; } - tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr); + tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b); memcpy(&req->dest, dest, sizeof(*dest)); req->net = net; - req->bearer_id = b_ptr->identity; - req->domain = b_ptr->domain; + req->bearer_id = b->identity; + req->domain = b->domain; req->num_nodes = 0; req->timer_intv = TIPC_LINK_REQ_INIT; spin_lock_init(&req->lock); setup_timer(&req->timer, disc_timeout, (unsigned long)req); mod_timer(&req->timer, jiffies + req->timer_intv); - b_ptr->link_req = req; + b->link_req = req; skb = skb_clone(req->buf, GFP_ATOMIC); if (skb) tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest); @@ -313,19 +313,19 @@ void tipc_disc_delete(struct tipc_link_req *req) /** * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace - * @b_ptr: ptr to bearer issuing requests + * @b: ptr to bearer issuing requests * @dest_domain: network domain to which links can be established */ -void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr) +void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { - struct tipc_link_req *req = b_ptr->link_req; + struct tipc_link_req *req = b->link_req; struct sk_buff *skb; spin_lock_bh(&req->lock); - tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr); + tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b); req->net = net; - req->bearer_id = b_ptr->identity; - req->domain = b_ptr->domain; + req->bearer_id = b->identity; + req->domain = b->domain; req->num_nodes = 0; req->timer_intv = TIPC_LINK_REQ_INIT; mod_timer(&req->timer, jiffies + req->timer_intv); diff --git a/net/tipc/link.c b/net/tipc/link.c index 9efbdbde2b08..0c2944fb9ae0 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -45,28 +45,156 @@ #include <linux/pkt_sched.h> +struct tipc_stats { + u32 sent_info; /* used in counting # sent packets */ + u32 recv_info; /* used in counting # recv'd packets */ + u32 sent_states; + u32 recv_states; + u32 sent_probes; + u32 recv_probes; + u32 sent_nacks; + u32 recv_nacks; + u32 sent_acks; + u32 sent_bundled; + u32 sent_bundles; + u32 recv_bundled; + u32 recv_bundles; + u32 retransmitted; + u32 sent_fragmented; + u32 sent_fragments; + u32 recv_fragmented; + u32 recv_fragments; + u32 link_congs; /* # port sends blocked by congestion */ + u32 deferred_recv; + u32 duplicates; + u32 max_queue_sz; /* send queue size high water mark */ + u32 accu_queue_sz; /* used for send queue size profiling */ + u32 queue_sz_counts; /* used for send queue size profiling */ + u32 msg_length_counts; /* used for message length profiling */ + u32 msg_lengths_total; /* used for message length profiling */ + u32 msg_length_profile[7]; /* used for msg. length profiling */ +}; + +/** + * struct tipc_link - TIPC link data structure + * @addr: network address of link's peer node + * @name: link name character string + * @media_addr: media address to use when sending messages over link + * @timer: link timer + * @net: pointer to namespace struct + * @refcnt: reference counter for permanent references (owner node & timer) + * @peer_session: link session # being used by peer end of link + * @peer_bearer_id: bearer id used by link's peer endpoint + * @bearer_id: local bearer id used by link + * @tolerance: minimum link continuity loss needed to reset link [in ms] + * @keepalive_intv: link keepalive timer interval + * @abort_limit: # of unacknowledged continuity probes needed to reset link + * @state: current state of link FSM + * @peer_caps: bitmap describing capabilities of peer node + * @silent_intv_cnt: # of timer intervals without any reception from peer + * @proto_msg: template for control messages generated by link + * @pmsg: convenience pointer to "proto_msg" field + * @priority: current link priority + * @net_plane: current link network plane ('A' through 'H') + * @backlog_limit: backlog queue congestion thresholds (indexed by importance) + * @exp_msg_count: # of tunnelled messages expected during link changeover + * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset + * @mtu: current maximum packet size for this link + * @advertised_mtu: advertised own mtu when link is being established + * @transmitq: queue for sent, non-acked messages + * @backlogq: queue for messages waiting to be sent + * @snt_nxt: next sequence number to use for outbound messages + * @last_retransmitted: sequence number of most recently retransmitted message + * @stale_count: # of identical retransmit requests made by peer + * @ackers: # of peers that needs to ack each packet before it can be released + * @acked: # last packet acked by a certain peer. Used for broadcast. + * @rcv_nxt: next sequence number to expect for inbound messages + * @deferred_queue: deferred queue saved OOS b'cast message received from node + * @unacked_window: # of inbound messages rx'd without ack'ing back to peer + * @inputq: buffer queue for messages to be delivered upwards + * @namedq: buffer queue for name table messages to be delivered upwards + * @next_out: ptr to first unsent outbound message in queue + * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate + * @long_msg_seq_no: next identifier to use for outbound fragmented messages + * @reasm_buf: head of partially reassembled inbound message fragments + * @bc_rcvr: marks that this is a broadcast receiver link + * @stats: collects statistics regarding link activity + */ +struct tipc_link { + u32 addr; + char name[TIPC_MAX_LINK_NAME]; + struct tipc_media_addr *media_addr; + struct net *net; + + /* Management and link supervision data */ + u32 peer_session; + u32 peer_bearer_id; + u32 bearer_id; + u32 tolerance; + unsigned long keepalive_intv; + u32 abort_limit; + u32 state; + u16 peer_caps; + bool active; + u32 silent_intv_cnt; + struct { + unchar hdr[INT_H_SIZE]; + unchar body[TIPC_MAX_IF_NAME]; + } proto_msg; + struct tipc_msg *pmsg; + u32 priority; + char net_plane; + + /* Failover/synch */ + u16 drop_point; + struct sk_buff *failover_reasm_skb; + + /* Max packet negotiation */ + u16 mtu; + u16 advertised_mtu; + + /* Sending */ + struct sk_buff_head transmq; + struct sk_buff_head backlogq; + struct { + u16 len; + u16 limit; + } backlog[5]; + u16 snd_nxt; + u16 last_retransm; + u16 window; + u32 stale_count; + + /* Reception */ + u16 rcv_nxt; + u32 rcv_unacked; + struct sk_buff_head deferdq; + struct sk_buff_head *inputq; + struct sk_buff_head *namedq; + + /* Congestion handling */ + struct sk_buff_head wakeupq; + + /* Fragmentation/reassembly */ + struct sk_buff *reasm_buf; + + /* Broadcast */ + u16 ackers; + u16 acked; + struct tipc_link *bc_rcvlink; + struct tipc_link *bc_sndlink; + int nack_state; + bool bc_peer_is_up; + + /* Statistics */ + struct tipc_stats stats; +}; + /* * Error message prefixes */ static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; -static const char tipc_bclink_name[] = "broadcast-link"; - -static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { - [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_LINK_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_LINK_NAME - }, - [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } -}; /* Properties valid for media, bearar and link */ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { @@ -117,8 +245,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq); -static void link_reset_statistics(struct tipc_link *l_ptr); -static void link_print(struct tipc_link *l_ptr, const char *str); +static void link_print(struct tipc_link *l, const char *str); static void tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, @@ -183,6 +310,36 @@ void tipc_link_set_active(struct tipc_link *l, bool active) l->active = active; } +u32 tipc_link_id(struct tipc_link *l) +{ + return l->peer_bearer_id << 16 | l->bearer_id; +} + +int tipc_link_window(struct tipc_link *l) +{ + return l->window; +} + +int tipc_link_prio(struct tipc_link *l) +{ + return l->priority; +} + +unsigned long tipc_link_tolerance(struct tipc_link *l) +{ + return l->tolerance; +} + +struct sk_buff_head *tipc_link_inputq(struct tipc_link *l) +{ + return l->inputq; +} + +char tipc_link_plane(struct tipc_link *l) +{ + return l->net_plane; +} + void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq) @@ -191,6 +348,7 @@ void tipc_link_add_bc_peer(struct tipc_link *snd_l, snd_l->ackers++; rcv_l->acked = snd_l->snd_nxt - 1; + snd_l->state = LINK_ESTABLISHED; tipc_link_build_bc_init_msg(uc_l, xmitq); } @@ -206,6 +364,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, rcv_l->state = LINK_RESET; if (!snd_l->ackers) { tipc_link_reset(snd_l); + snd_l->state = LINK_RESET; __skb_queue_purge(xmitq); } } @@ -225,11 +384,31 @@ int tipc_link_mtu(struct tipc_link *l) return l->mtu; } +u16 tipc_link_rcv_nxt(struct tipc_link *l) +{ + return l->rcv_nxt; +} + +u16 tipc_link_acked(struct tipc_link *l) +{ + return l->acked; +} + +char *tipc_link_name(struct tipc_link *l) +{ + return l->name; +} + static u32 link_own_addr(struct tipc_link *l) { return msg_prevnode(l->pmsg); } +void tipc_link_reinit(struct tipc_link *l, u32 addr) +{ + msg_set_prevnode(l->pmsg, addr); +} + /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -692,7 +871,7 @@ void tipc_link_reset(struct tipc_link *l) l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; - link_reset_statistics(l); + tipc_link_reset_stats(l); } /** @@ -1085,8 +1264,9 @@ drop: /* * Send protocol message to the other endpoint. */ -void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg, - u32 gap, u32 tolerance, u32 priority) +static void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, + int probe_msg, u32 gap, u32 tolerance, + u32 priority) { struct sk_buff *skb = NULL; struct sk_buff_head xmitq; @@ -1260,6 +1440,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* fall thru' */ case ACTIVATE_MSG: + skb_linearize(skb); + hdr = buf_msg(skb); /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; @@ -1525,53 +1707,17 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } -/* tipc_link_find_owner - locate owner node of link by link's name - * @net: the applicable net namespace - * @name: pointer to link name string - * @bearer_id: pointer to index in 'node->links' array where the link was found. - * - * Returns pointer to node owning the link, or 0 if no matching link is found. - */ -static struct tipc_node *tipc_link_find_owner(struct net *net, - const char *link_name, - unsigned int *bearer_id) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *l_ptr; - struct tipc_node *n_ptr; - struct tipc_node *found_node = NULL; - int i; - - *bearer_id = 0; - rcu_read_lock(); - list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { - tipc_node_lock(n_ptr); - for (i = 0; i < MAX_BEARERS; i++) { - l_ptr = n_ptr->links[i].link; - if (l_ptr && !strcmp(l_ptr->name, link_name)) { - *bearer_id = i; - found_node = n_ptr; - break; - } - } - tipc_node_unlock(n_ptr); - if (found_node) - break; - } - rcu_read_unlock(); - - return found_node; -} - /** - * link_reset_statistics - reset link statistics - * @l_ptr: pointer to link + * link_reset_stats - reset link statistics + * @l: pointer to link */ -static void link_reset_statistics(struct tipc_link *l_ptr) +void tipc_link_reset_stats(struct tipc_link *l) { - memset(&l_ptr->stats, 0, sizeof(l_ptr->stats)); - l_ptr->stats.sent_info = l_ptr->snd_nxt; - l_ptr->stats.recv_info = l_ptr->rcv_nxt; + memset(&l->stats, 0, sizeof(l->stats)); + if (!link_is_bc_sndlink(l)) { + l->stats.sent_info = l->snd_nxt; + l->stats.recv_info = l->rcv_nxt; + } } static void link_print(struct tipc_link *l, const char *str) @@ -1624,84 +1770,6 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) return 0; } -int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) -{ - int err; - int res = 0; - int bearer_id; - char *name; - struct tipc_link *link; - struct tipc_node *node; - struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; - struct net *net = sock_net(skb->sk); - - if (!info->attrs[TIPC_NLA_LINK]) - return -EINVAL; - - err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, - info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy); - if (err) - return err; - - if (!attrs[TIPC_NLA_LINK_NAME]) - return -EINVAL; - - name = nla_data(attrs[TIPC_NLA_LINK_NAME]); - - if (strcmp(name, tipc_bclink_name) == 0) - return tipc_nl_bc_link_set(net, attrs); - - node = tipc_link_find_owner(net, name, &bearer_id); - if (!node) - return -EINVAL; - - tipc_node_lock(node); - - link = node->links[bearer_id].link; - if (!link) { - res = -EINVAL; - goto out; - } - - if (attrs[TIPC_NLA_LINK_PROP]) { - struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; - - err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], - props); - if (err) { - res = err; - goto out; - } - - if (props[TIPC_NLA_PROP_TOL]) { - u32 tol; - - tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - link->tolerance = tol; - tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0); - } - if (props[TIPC_NLA_PROP_PRIO]) { - u32 prio; - - prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); - link->priority = prio; - tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio); - } - if (props[TIPC_NLA_PROP_WIN]) { - u32 win; - - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); - } - } - -out: - tipc_node_unlock(node); - - return res; -} - static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) { int i; @@ -1768,8 +1836,8 @@ msg_full: } /* Caller should hold appropriate locks to protect the link */ -static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, - struct tipc_link *link, int nlflags) +int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *link, int nlflags) { int err; void *hdr; @@ -1838,198 +1906,134 @@ msg_full: return -EMSGSIZE; } -/* Caller should hold node lock */ -static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, - struct tipc_node *node, u32 *prev_link) +static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, + struct tipc_stats *stats) { - u32 i; - int err; - - for (i = *prev_link; i < MAX_BEARERS; i++) { - *prev_link = i; - - if (!node->links[i].link) - continue; + int i; + struct nlattr *nest; - err = __tipc_nl_add_link(net, msg, - node->links[i].link, NLM_F_MULTI); - if (err) - return err; - } - *prev_link = 0; + struct nla_map { + __u32 key; + __u32 val; + }; - return 0; -} + struct nla_map map[] = { + {TIPC_NLA_STATS_RX_INFO, stats->recv_info}, + {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments}, + {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented}, + {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles}, + {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled}, + {TIPC_NLA_STATS_TX_INFO, stats->sent_info}, + {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments}, + {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented}, + {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles}, + {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled}, + {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks}, + {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv}, + {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks}, + {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks}, + {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted}, + {TIPC_NLA_STATS_DUPLICATES, stats->duplicates}, + {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs}, + {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz}, + {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ? + (stats->accu_queue_sz / stats->queue_sz_counts) : 0} + }; -int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_node *node; - struct tipc_nl_msg msg; - u32 prev_node = cb->args[0]; - u32 prev_link = cb->args[1]; - int done = cb->args[2]; - int err; + nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS); + if (!nest) + return -EMSGSIZE; - if (done) - return 0; + for (i = 0; i < ARRAY_SIZE(map); i++) + if (nla_put_u32(skb, map[i].key, map[i].val)) + goto msg_full; - msg.skb = skb; - msg.portid = NETLINK_CB(cb->skb).portid; - msg.seq = cb->nlh->nlmsg_seq; - - rcu_read_lock(); - if (prev_node) { - node = tipc_node_find(net, prev_node); - if (!node) { - /* We never set seq or call nl_dump_check_consistent() - * this means that setting prev_seq here will cause the - * consistence check to fail in the netlink callback - * handler. Resulting in the last NLMSG_DONE message - * having the NLM_F_DUMP_INTR flag set. - */ - cb->prev_seq = 1; - goto out; - } - tipc_node_put(node); - - list_for_each_entry_continue_rcu(node, &tn->node_list, - list) { - tipc_node_lock(node); - err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); - tipc_node_unlock(node); - if (err) - goto out; - - prev_node = node->addr; - } - } else { - err = tipc_nl_add_bc_link(net, &msg); - if (err) - goto out; - - list_for_each_entry_rcu(node, &tn->node_list, list) { - tipc_node_lock(node); - err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); - tipc_node_unlock(node); - if (err) - goto out; - - prev_node = node->addr; - } - } - done = 1; -out: - rcu_read_unlock(); + nla_nest_end(skb, nest); - cb->args[0] = prev_node; - cb->args[1] = prev_link; - cb->args[2] = done; + return 0; +msg_full: + nla_nest_cancel(skb, nest); - return skb->len; + return -EMSGSIZE; } -int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) { - struct net *net = genl_info_net(info); - struct tipc_nl_msg msg; - char *name; int err; + void *hdr; + struct nlattr *attrs; + struct nlattr *prop; + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *bcl = tn->bcl; - msg.portid = info->snd_portid; - msg.seq = info->snd_seq; - - if (!info->attrs[TIPC_NLA_LINK_NAME]) - return -EINVAL; - name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]); + if (!bcl) + return 0; - msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!msg.skb) - return -ENOMEM; + tipc_bcast_lock(net); - if (strcmp(name, tipc_bclink_name) == 0) { - err = tipc_nl_add_bc_link(net, &msg); - if (err) { - nlmsg_free(msg.skb); - return err; - } - } else { - int bearer_id; - struct tipc_node *node; - struct tipc_link *link; + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + NLM_F_MULTI, TIPC_NL_LINK_GET); + if (!hdr) + return -EMSGSIZE; - node = tipc_link_find_owner(net, name, &bearer_id); - if (!node) - return -EINVAL; + attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); + if (!attrs) + goto msg_full; - tipc_node_lock(node); - link = node->links[bearer_id].link; - if (!link) { - tipc_node_unlock(node); - nlmsg_free(msg.skb); - return -EINVAL; - } + /* The broadcast link is always up */ + if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) + goto attr_msg_full; - err = __tipc_nl_add_link(net, &msg, link, 0); - tipc_node_unlock(node); - if (err) { - nlmsg_free(msg.skb); - return err; - } - } + if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST)) + goto attr_msg_full; + if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) + goto attr_msg_full; - return genlmsg_reply(msg.skb, info); -} + prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); + if (!prop) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + goto prop_msg_full; + nla_nest_end(msg->skb, prop); -int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) -{ - int err; - char *link_name; - unsigned int bearer_id; - struct tipc_link *link; - struct tipc_node *node; - struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; - struct net *net = sock_net(skb->sk); - - if (!info->attrs[TIPC_NLA_LINK]) - return -EINVAL; - - err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, - info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy); + err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats); if (err) - return err; - - if (!attrs[TIPC_NLA_LINK_NAME]) - return -EINVAL; - - link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + goto attr_msg_full; - if (strcmp(link_name, tipc_bclink_name) == 0) { - err = tipc_bclink_reset_stats(net); - if (err) - return err; - return 0; - } + tipc_bcast_unlock(net); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); - node = tipc_link_find_owner(net, link_name, &bearer_id); - if (!node) - return -EINVAL; + return 0; - tipc_node_lock(node); +prop_msg_full: + nla_nest_cancel(msg->skb, prop); +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + tipc_bcast_unlock(net); + genlmsg_cancel(msg->skb, hdr); - link = node->links[bearer_id].link; - if (!link) { - tipc_node_unlock(node); - return -EINVAL; - } + return -EMSGSIZE; +} - link_reset_statistics(link); +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol) +{ + l->tolerance = tol; + tipc_link_proto_xmit(l, STATE_MSG, 0, 0, tol, 0); +} - tipc_node_unlock(node); +void tipc_link_set_prio(struct tipc_link *l, u32 prio) +{ + l->priority = prio; + tipc_link_proto_xmit(l, STATE_MSG, 0, 0, 0, prio); +} - return 0; +void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) +{ + l->abort_limit = limit; } diff --git a/net/tipc/link.h b/net/tipc/link.h index 66d859b66c84..b2ae0f4276af 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -45,10 +45,6 @@ */ #define ELINKCONG EAGAIN /* link congestion <=> resource unavailable */ -/* Out-of-range value for link sequence numbers - */ -#define INVALID_LINK_SEQ 0x10000 - /* Link FSM events: */ enum { @@ -75,151 +71,6 @@ enum { */ #define MAX_PKT_DEFAULT 1500 -struct tipc_stats { - u32 sent_info; /* used in counting # sent packets */ - u32 recv_info; /* used in counting # recv'd packets */ - u32 sent_states; - u32 recv_states; - u32 sent_probes; - u32 recv_probes; - u32 sent_nacks; - u32 recv_nacks; - u32 sent_acks; - u32 sent_bundled; - u32 sent_bundles; - u32 recv_bundled; - u32 recv_bundles; - u32 retransmitted; - u32 sent_fragmented; - u32 sent_fragments; - u32 recv_fragmented; - u32 recv_fragments; - u32 link_congs; /* # port sends blocked by congestion */ - u32 deferred_recv; - u32 duplicates; - u32 max_queue_sz; /* send queue size high water mark */ - u32 accu_queue_sz; /* used for send queue size profiling */ - u32 queue_sz_counts; /* used for send queue size profiling */ - u32 msg_length_counts; /* used for message length profiling */ - u32 msg_lengths_total; /* used for message length profiling */ - u32 msg_length_profile[7]; /* used for msg. length profiling */ -}; - -/** - * struct tipc_link - TIPC link data structure - * @addr: network address of link's peer node - * @name: link name character string - * @media_addr: media address to use when sending messages over link - * @timer: link timer - * @net: pointer to namespace struct - * @refcnt: reference counter for permanent references (owner node & timer) - * @peer_session: link session # being used by peer end of link - * @peer_bearer_id: bearer id used by link's peer endpoint - * @bearer_id: local bearer id used by link - * @tolerance: minimum link continuity loss needed to reset link [in ms] - * @keepalive_intv: link keepalive timer interval - * @abort_limit: # of unacknowledged continuity probes needed to reset link - * @state: current state of link FSM - * @peer_caps: bitmap describing capabilities of peer node - * @silent_intv_cnt: # of timer intervals without any reception from peer - * @proto_msg: template for control messages generated by link - * @pmsg: convenience pointer to "proto_msg" field - * @priority: current link priority - * @net_plane: current link network plane ('A' through 'H') - * @backlog_limit: backlog queue congestion thresholds (indexed by importance) - * @exp_msg_count: # of tunnelled messages expected during link changeover - * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset - * @mtu: current maximum packet size for this link - * @advertised_mtu: advertised own mtu when link is being established - * @transmitq: queue for sent, non-acked messages - * @backlogq: queue for messages waiting to be sent - * @snt_nxt: next sequence number to use for outbound messages - * @last_retransmitted: sequence number of most recently retransmitted message - * @stale_count: # of identical retransmit requests made by peer - * @ackers: # of peers that needs to ack each packet before it can be released - * @acked: # last packet acked by a certain peer. Used for broadcast. - * @rcv_nxt: next sequence number to expect for inbound messages - * @deferred_queue: deferred queue saved OOS b'cast message received from node - * @unacked_window: # of inbound messages rx'd without ack'ing back to peer - * @inputq: buffer queue for messages to be delivered upwards - * @namedq: buffer queue for name table messages to be delivered upwards - * @next_out: ptr to first unsent outbound message in queue - * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate - * @long_msg_seq_no: next identifier to use for outbound fragmented messages - * @reasm_buf: head of partially reassembled inbound message fragments - * @bc_rcvr: marks that this is a broadcast receiver link - * @stats: collects statistics regarding link activity - */ -struct tipc_link { - u32 addr; - char name[TIPC_MAX_LINK_NAME]; - struct tipc_media_addr *media_addr; - struct net *net; - - /* Management and link supervision data */ - u32 peer_session; - u32 peer_bearer_id; - u32 bearer_id; - u32 tolerance; - unsigned long keepalive_intv; - u32 abort_limit; - u32 state; - u16 peer_caps; - bool active; - u32 silent_intv_cnt; - struct { - unchar hdr[INT_H_SIZE]; - unchar body[TIPC_MAX_IF_NAME]; - } proto_msg; - struct tipc_msg *pmsg; - u32 priority; - char net_plane; - - /* Failover/synch */ - u16 drop_point; - struct sk_buff *failover_reasm_skb; - - /* Max packet negotiation */ - u16 mtu; - u16 advertised_mtu; - - /* Sending */ - struct sk_buff_head transmq; - struct sk_buff_head backlogq; - struct { - u16 len; - u16 limit; - } backlog[5]; - u16 snd_nxt; - u16 last_retransm; - u16 window; - u32 stale_count; - - /* Reception */ - u16 rcv_nxt; - u32 rcv_unacked; - struct sk_buff_head deferdq; - struct sk_buff_head *inputq; - struct sk_buff_head *namedq; - - /* Congestion handling */ - struct sk_buff_head wakeupq; - - /* Fragmentation/reassembly */ - struct sk_buff *reasm_buf; - - /* Broadcast */ - u16 ackers; - u16 acked; - struct tipc_link *bc_rcvlink; - struct tipc_link *bc_sndlink; - int nack_state; - bool bc_peer_is_up; - - /* Statistics */ - struct tipc_stats stats; -}; - bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, int window, u32 session, u32 ownnode, u32 peer, @@ -235,11 +86,11 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, struct tipc_link **link); +void tipc_link_reinit(struct tipc_link *l, u32 addr); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); -void tipc_link_reset_fragments(struct tipc_link *l_ptr); bool tipc_link_is_up(struct tipc_link *l); bool tipc_link_peer_is_down(struct tipc_link *l); bool tipc_link_is_reset(struct tipc_link *l); @@ -248,15 +99,25 @@ bool tipc_link_is_synching(struct tipc_link *l); bool tipc_link_is_failingover(struct tipc_link *l); bool tipc_link_is_blocked(struct tipc_link *l); void tipc_link_set_active(struct tipc_link *l, bool active); -void tipc_link_reset(struct tipc_link *l_ptr); -int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, +void tipc_link_reset(struct tipc_link *l); +void tipc_link_reset_stats(struct tipc_link *l); +int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, struct sk_buff_head *xmitq); +struct sk_buff_head *tipc_link_inputq(struct tipc_link *l); +u16 tipc_link_rcv_nxt(struct tipc_link *l); +u16 tipc_link_acked(struct tipc_link *l); +u32 tipc_link_id(struct tipc_link *l); +char *tipc_link_name(struct tipc_link *l); +char tipc_link_plane(struct tipc_link *l); +int tipc_link_prio(struct tipc_link *l); +int tipc_link_window(struct tipc_link *l); +unsigned long tipc_link_tolerance(struct tipc_link *l); +void tipc_link_set_tolerance(struct tipc_link *l, u32 tol); +void tipc_link_set_prio(struct tipc_link *l, u32 prio); +void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); - -int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb); -int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info); -int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); -int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); +int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index c07612bab95c..ebe9d0ff6e9e 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -84,31 +84,6 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, return buf; } -void named_cluster_distribute(struct net *net, struct sk_buff *skb) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff *oskb; - struct tipc_node *node; - u32 dnode; - - rcu_read_lock(); - list_for_each_entry_rcu(node, &tn->node_list, list) { - dnode = node->addr; - if (in_own_node(net, dnode)) - continue; - if (!tipc_node_is_up(node)) - continue; - oskb = pskb_copy(skb, GFP_ATOMIC); - if (!oskb) - break; - msg_set_destnode(buf_msg(oskb), dnode); - tipc_node_xmit_skb(net, oskb, dnode, 0); - } - rcu_read_unlock(); - - kfree_skb(skb); -} - /** * tipc_named_publish - tell other nodes about a new publication by this node */ @@ -226,42 +201,6 @@ void tipc_named_node_up(struct net *net, u32 dnode) tipc_node_xmit(net, &head, dnode, 0); } -static void tipc_publ_subscribe(struct net *net, struct publication *publ, - u32 addr) -{ - struct tipc_node *node; - - if (in_own_node(net, addr)) - return; - - node = tipc_node_find(net, addr); - if (!node) { - pr_warn("Node subscription rejected, unknown node 0x%x\n", - addr); - return; - } - - tipc_node_lock(node); - list_add_tail(&publ->nodesub_list, &node->publ_list); - tipc_node_unlock(node); - tipc_node_put(node); -} - -static void tipc_publ_unsubscribe(struct net *net, struct publication *publ, - u32 addr) -{ - struct tipc_node *node; - - node = tipc_node_find(net, addr); - if (!node) - return; - - tipc_node_lock(node); - list_del_init(&publ->nodesub_list); - tipc_node_unlock(node); - tipc_node_put(node); -} - /** * tipc_publ_purge - remove publication associated with a failed node * @@ -277,7 +216,7 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->node, publ->ref, publ->key); if (p) - tipc_publ_unsubscribe(net, p, addr); + tipc_node_unsubscribe(net, &p->nodesub_list, addr); spin_unlock_bh(&tn->nametbl_lock); if (p != publ) { @@ -317,7 +256,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, TIPC_CLUSTER_SCOPE, node, ntohl(i->ref), ntohl(i->key)); if (publ) { - tipc_publ_subscribe(net, publ, node); + tipc_node_subscribe(net, &publ->nodesub_list, node); return true; } } else if (dtype == WITHDRAWAL) { @@ -326,7 +265,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, node, ntohl(i->ref), ntohl(i->key)); if (publ) { - tipc_publ_unsubscribe(net, publ, node); + tipc_node_unsubscribe(net, &publ->nodesub_list, node); kfree_rcu(publ, rcu); return true; } @@ -397,6 +336,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) spin_lock_bh(&tn->nametbl_lock); for (skb = skb_dequeue(inputq); skb; skb = skb_dequeue(inputq)) { + skb_linearize(skb); msg = buf_msg(skb); mtype = msg_type(msg); item = (struct distr_item *)msg_data(msg); diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index dd2d9fd80da2..1264ba0af937 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -69,7 +69,6 @@ struct distr_item { struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ); struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); -void named_cluster_distribute(struct net *net, struct sk_buff *buf); void tipc_named_node_up(struct net *net, u32 dnode); void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue); void tipc_named_reinit(struct net *net); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 0f47f08bf38f..91fce70291a8 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -42,6 +42,7 @@ #include "subscr.h" #include "bcast.h" #include "addr.h" +#include "node.h" #include <net/genetlink.h> #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ @@ -677,7 +678,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, spin_unlock_bh(&tn->nametbl_lock); if (buf) - named_cluster_distribute(net, buf); + tipc_node_broadcast(net, buf); return publ; } @@ -709,7 +710,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, spin_unlock_bh(&tn->nametbl_lock); if (skb) { - named_cluster_distribute(net, skb); + tipc_node_broadcast(net, skb); return 1; } return 0; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 7f6475efc984..8975b0135b76 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -101,18 +101,18 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_LINK_GET, - .doit = tipc_nl_link_get, - .dumpit = tipc_nl_link_dump, + .doit = tipc_nl_node_get_link, + .dumpit = tipc_nl_node_dump_link, .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_LINK_SET, - .doit = tipc_nl_link_set, + .doit = tipc_nl_node_set_link, .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_LINK_RESET_STATS, - .doit = tipc_nl_link_reset_stats, + .doit = tipc_nl_node_reset_link_stats, .policy = tipc_nl_policy, }, { diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 1eadc95e1132..2c016fdefe97 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1023,25 +1023,25 @@ static int tipc_nl_compat_handle(struct tipc_nl_compat_msg *msg) msg->req_type = TIPC_TLV_LINK_NAME; msg->rep_size = ULTRA_STRING_MAX_LEN; msg->rep_type = TIPC_TLV_ULTRA_STRING; - dump.dumpit = tipc_nl_link_dump; + dump.dumpit = tipc_nl_node_dump_link; dump.format = tipc_nl_compat_link_stat_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_GET_LINKS: msg->req_type = TIPC_TLV_NET_ADDR; msg->rep_size = ULTRA_STRING_MAX_LEN; - dump.dumpit = tipc_nl_link_dump; + dump.dumpit = tipc_nl_node_dump_link; dump.format = tipc_nl_compat_link_dump; return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_SET_LINK_TOL: case TIPC_CMD_SET_LINK_PRI: case TIPC_CMD_SET_LINK_WINDOW: msg->req_type = TIPC_TLV_LINK_CONFIG; - doit.doit = tipc_nl_link_set; + doit.doit = tipc_nl_node_set_link; doit.transcode = tipc_nl_compat_link_set; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_RESET_LINK_STATS: msg->req_type = TIPC_TLV_LINK_NAME; - doit.doit = tipc_nl_link_reset_stats; + doit.doit = tipc_nl_node_reset_link_stats; doit.transcode = tipc_nl_compat_link_reset_stats; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_SHOW_NAME_TABLE: diff --git a/net/tipc/node.c b/net/tipc/node.c index 20cddec0a43c..fa97d9649a28 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -42,6 +42,84 @@ #include "bcast.h" #include "discover.h" +#define INVALID_NODE_SIG 0x10000 + +/* Flags used to take different actions according to flag type + * TIPC_NOTIFY_NODE_DOWN: notify node is down + * TIPC_NOTIFY_NODE_UP: notify node is up + * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type + */ +enum { + TIPC_NOTIFY_NODE_DOWN = (1 << 3), + TIPC_NOTIFY_NODE_UP = (1 << 4), + TIPC_NOTIFY_LINK_UP = (1 << 6), + TIPC_NOTIFY_LINK_DOWN = (1 << 7) +}; + +struct tipc_link_entry { + struct tipc_link *link; + spinlock_t lock; /* per link */ + u32 mtu; + struct sk_buff_head inputq; + struct tipc_media_addr maddr; +}; + +struct tipc_bclink_entry { + struct tipc_link *link; + struct sk_buff_head inputq1; + struct sk_buff_head arrvq; + struct sk_buff_head inputq2; + struct sk_buff_head namedq; +}; + +/** + * struct tipc_node - TIPC node structure + * @addr: network address of node + * @ref: reference counter to node object + * @lock: rwlock governing access to structure + * @net: the applicable net namespace + * @hash: links to adjacent nodes in unsorted hash chain + * @inputq: pointer to input queue containing messages for msg event + * @namedq: pointer to name table input queue with name table messages + * @active_links: bearer ids of active links, used as index into links[] array + * @links: array containing references to all links to node + * @action_flags: bit mask of different types of node actions + * @state: connectivity state vs peer node + * @sync_point: sequence number where synch/failover is finished + * @list: links to adjacent nodes in sorted list of cluster's nodes + * @working_links: number of working links to node (both active and standby) + * @link_cnt: number of links to node + * @capabilities: bitmap, indicating peer node's functional capabilities + * @signature: node instance identifier + * @link_id: local and remote bearer ids of changing link, if any + * @publ_list: list of publications + * @rcu: rcu struct for tipc_node + */ +struct tipc_node { + u32 addr; + struct kref kref; + rwlock_t lock; + struct net *net; + struct hlist_node hash; + int active_links[2]; + struct tipc_link_entry links[MAX_BEARERS]; + struct tipc_bclink_entry bc_entry; + int action_flags; + struct list_head list; + int state; + u16 sync_point; + int link_cnt; + u16 working_links; + u16 capabilities; + u32 signature; + u32 link_id; + struct list_head publ_list; + struct list_head conn_sks; + unsigned long keepalive_intv; + struct timer_list timer; + struct rcu_head rcu; +}; + /* Node FSM states and events: */ enum { @@ -75,6 +153,9 @@ static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(unsigned long data); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); +static struct tipc_node *tipc_node_find(struct net *net, u32 addr); +static void tipc_node_put(struct tipc_node *node); +static bool tipc_node_is_up(struct tipc_node *n); struct tipc_sock_conn { u32 port; @@ -83,12 +164,54 @@ struct tipc_sock_conn { struct list_head list; }; +static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { + [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_LINK_NAME] = { + .type = NLA_STRING, + .len = TIPC_MAX_LINK_NAME + }, + [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } +}; + static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } }; +static struct tipc_link *node_active_link(struct tipc_node *n, int sel) +{ + int bearer_id = n->active_links[sel & 1]; + + if (unlikely(bearer_id == INVALID_BEARER_ID)) + return NULL; + + return n->links[bearer_id].link; +} + +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) +{ + struct tipc_node *n; + int bearer_id; + unsigned int mtu = MAX_MSG_SIZE; + + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return mtu; + + bearer_id = n->active_links[sel & 1]; + if (likely(bearer_id != INVALID_BEARER_ID)) + mtu = n->links[bearer_id].mtu; + tipc_node_put(n); + return mtu; +} /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. The number of hash table @@ -107,7 +230,7 @@ static void tipc_node_kref_release(struct kref *kref) tipc_node_delete(node); } -void tipc_node_put(struct tipc_node *node) +static void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } @@ -120,7 +243,7 @@ static void tipc_node_get(struct tipc_node *node) /* * tipc_node_find - locate specified node object, if it exists */ -struct tipc_node *tipc_node_find(struct net *net, u32 addr) +static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; @@ -141,66 +264,122 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr) return NULL; } +static void tipc_node_read_lock(struct tipc_node *n) +{ + read_lock_bh(&n->lock); +} + +static void tipc_node_read_unlock(struct tipc_node *n) +{ + read_unlock_bh(&n->lock); +} + +static void tipc_node_write_lock(struct tipc_node *n) +{ + write_lock_bh(&n->lock); +} + +static void tipc_node_write_unlock(struct tipc_node *n) +{ + struct net *net = n->net; + u32 addr = 0; + u32 flags = n->action_flags; + u32 link_id = 0; + struct list_head *publ_list; + + if (likely(!flags)) { + write_unlock_bh(&n->lock); + return; + } + + addr = n->addr; + link_id = n->link_id; + publ_list = &n->publ_list; + + n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | + TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); + + write_unlock_bh(&n->lock); + + if (flags & TIPC_NOTIFY_NODE_DOWN) + tipc_publ_notify(net, publ_list, addr); + + if (flags & TIPC_NOTIFY_NODE_UP) + tipc_named_node_up(net, addr); + + if (flags & TIPC_NOTIFY_LINK_UP) + tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, + TIPC_NODE_SCOPE, link_id, addr); + + if (flags & TIPC_NOTIFY_LINK_DOWN) + tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, + link_id, addr); +} + struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_node *n_ptr, *temp_node; + struct tipc_node *n, *temp_node; + int i; spin_lock_bh(&tn->node_list_lock); - n_ptr = tipc_node_find(net, addr); - if (n_ptr) + n = tipc_node_find(net, addr); + if (n) goto exit; - n_ptr = kzalloc(sizeof(*n_ptr), GFP_ATOMIC); - if (!n_ptr) { + n = kzalloc(sizeof(*n), GFP_ATOMIC); + if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } - n_ptr->addr = addr; - n_ptr->net = net; - n_ptr->capabilities = capabilities; - kref_init(&n_ptr->kref); - spin_lock_init(&n_ptr->lock); - INIT_HLIST_NODE(&n_ptr->hash); - INIT_LIST_HEAD(&n_ptr->list); - INIT_LIST_HEAD(&n_ptr->publ_list); - INIT_LIST_HEAD(&n_ptr->conn_sks); - skb_queue_head_init(&n_ptr->bc_entry.namedq); - skb_queue_head_init(&n_ptr->bc_entry.inputq1); - __skb_queue_head_init(&n_ptr->bc_entry.arrvq); - skb_queue_head_init(&n_ptr->bc_entry.inputq2); - hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]); + n->addr = addr; + n->net = net; + n->capabilities = capabilities; + kref_init(&n->kref); + rwlock_init(&n->lock); + INIT_HLIST_NODE(&n->hash); + INIT_LIST_HEAD(&n->list); + INIT_LIST_HEAD(&n->publ_list); + INIT_LIST_HEAD(&n->conn_sks); + skb_queue_head_init(&n->bc_entry.namedq); + skb_queue_head_init(&n->bc_entry.inputq1); + __skb_queue_head_init(&n->bc_entry.arrvq); + skb_queue_head_init(&n->bc_entry.inputq2); + for (i = 0; i < MAX_BEARERS; i++) + spin_lock_init(&n->links[i].lock); + hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { - if (n_ptr->addr < temp_node->addr) + if (n->addr < temp_node->addr) break; } - list_add_tail_rcu(&n_ptr->list, &temp_node->list); - n_ptr->state = SELF_DOWN_PEER_LEAVING; - n_ptr->signature = INVALID_NODE_SIG; - n_ptr->active_links[0] = INVALID_BEARER_ID; - n_ptr->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), n_ptr->addr, - U16_MAX, tipc_bc_sndlink(net)->window, - n_ptr->capabilities, - &n_ptr->bc_entry.inputq1, - &n_ptr->bc_entry.namedq, + list_add_tail_rcu(&n->list, &temp_node->list); + n->state = SELF_DOWN_PEER_LEAVING; + n->signature = INVALID_NODE_SIG; + n->active_links[0] = INVALID_BEARER_ID; + n->active_links[1] = INVALID_BEARER_ID; + if (!tipc_link_bc_create(net, tipc_own_addr(net), n->addr, + U16_MAX, + tipc_link_window(tipc_bc_sndlink(net)), + n->capabilities, + &n->bc_entry.inputq1, + &n->bc_entry.namedq, tipc_bc_sndlink(net), - &n_ptr->bc_entry.link)) { + &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); - kfree(n_ptr); - n_ptr = NULL; + kfree(n); + n = NULL; goto exit; } - tipc_node_get(n_ptr); - setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr); - n_ptr->keepalive_intv = U32_MAX; + tipc_node_get(n); + setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n); + n->keepalive_intv = U32_MAX; exit: spin_unlock_bh(&tn->node_list_lock); - return n_ptr; + return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { - unsigned long tol = l->tolerance; + unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; unsigned long keepalive_intv = msecs_to_jiffies(intv); @@ -209,7 +388,7 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) n->keepalive_intv = keepalive_intv; /* Ensure link's abort limit corresponds to current interval */ - l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv); + tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv)); } static void tipc_node_delete(struct tipc_node *node) @@ -234,6 +413,42 @@ void tipc_node_stop(struct net *net) spin_unlock_bh(&tn->node_list_lock); } +void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) +{ + struct tipc_node *n; + + if (in_own_node(net, addr)) + return; + + n = tipc_node_find(net, addr); + if (!n) { + pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); + return; + } + tipc_node_write_lock(n); + list_add_tail(subscr, &n->publ_list); + tipc_node_write_unlock(n); + tipc_node_put(n); +} + +void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) +{ + struct tipc_node *n; + + if (in_own_node(net, addr)) + return; + + n = tipc_node_find(net, addr); + if (!n) { + pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); + return; + } + tipc_node_write_lock(n); + list_del_init(subscr); + tipc_node_write_unlock(n); + tipc_node_put(n); +} + int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; @@ -257,9 +472,9 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) conn->port = port; conn->peer_port = peer_port; - tipc_node_lock(node); + tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); - tipc_node_unlock(node); + tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; @@ -277,14 +492,14 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) if (!node) return; - tipc_node_lock(node); + tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } - tipc_node_unlock(node); + tipc_node_write_unlock(node); tipc_node_put(node); } @@ -301,14 +516,16 @@ static void tipc_node_timeout(unsigned long data) __skb_queue_head_init(&xmitq); for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { - tipc_node_lock(n); + tipc_node_read_lock(n); le = &n->links[bearer_id]; + spin_lock_bh(&le->lock); if (le->link) { /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); } - tipc_node_unlock(n); + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); @@ -340,16 +557,16 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; - n->link_id = nl->peer_bearer_id << 16 | bearer_id; + n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ - n->links[bearer_id].mtu = nl->mtu - INT_H_SIZE; + n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE; tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", - nl->name, nl->net_plane); + tipc_link_name(nl), tipc_link_plane(nl)); /* First link? => give it both slots */ if (!ol) { @@ -362,17 +579,17 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, } /* Second link => redistribute slots */ - if (nl->priority > ol->priority) { - pr_debug("Old link <%s> becomes standby\n", ol->name); + if (tipc_link_prio(nl) > tipc_link_prio(ol)) { + pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); - } else if (nl->priority == ol->priority) { + } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { - pr_debug("New link <%s> is standby\n", nl->name); + pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ @@ -387,9 +604,9 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { - tipc_node_lock(n); + tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); - tipc_node_unlock(n); + tipc_node_write_unlock(n); } /** @@ -402,7 +619,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; - int i, highest = 0; + int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; @@ -411,12 +628,12 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; - n->link_id = l->peer_bearer_id << 16 | *bearer_id; + n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", - l->name, l->net_plane); + tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; @@ -427,10 +644,11 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, continue; if (_l == l) continue; - if (_l->priority < highest) + prio = tipc_link_prio(_l); + if (prio < highest) continue; - if (_l->priority > highest) { - highest = _l->priority; + if (prio > highest) { + highest = prio; *slot0 = i; *slot1 = i; continue; @@ -453,17 +671,17 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ - tnl = node_active_link(n, 0); + *bearer_id = n->active_links[0]; + tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); - n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); + n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); - *maddr = &n->links[tnl->bearer_id].maddr; - *bearer_id = tnl->bearer_id; + *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) @@ -478,7 +696,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) __skb_queue_head_init(&xmitq); - tipc_node_lock(n); + tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); if (delete) { @@ -490,12 +708,12 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) /* Defuse pending tipc_node_link_up() */ tipc_link_fsm_evt(l, LINK_RESET_EVT); } - tipc_node_unlock(n); + tipc_node_write_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); } -bool tipc_node_is_up(struct tipc_node *n) +static bool tipc_node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } @@ -523,7 +741,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, if (!n) return; - tipc_node_lock(n); + tipc_node_write_lock(n); le = &n->links[b->identity]; @@ -626,7 +844,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: - tipc_node_unlock(n); + tipc_node_write_unlock(n); if (reset && !tipc_link_is_reset(l)) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); @@ -834,24 +1052,6 @@ illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); } -bool tipc_node_filter_pkt(struct tipc_node *n, struct tipc_msg *hdr) -{ - int state = n->state; - - if (likely(state == SELF_UP_PEER_UP)) - return true; - - if (state == SELF_LEAVING_PEER_DOWN) - return false; - - if (state == SELF_DOWN_PEER_LEAVING) { - if (msg_peer_node_is_up(hdr)) - return false; - } - - return true; -} - static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { @@ -913,56 +1113,18 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, if (bearer_id >= MAX_BEARERS) goto exit; - tipc_node_lock(node); + tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { - strncpy(linkname, link->name, len); + strncpy(linkname, tipc_link_name(link), len); err = 0; } exit: - tipc_node_unlock(node); + tipc_node_read_unlock(node); tipc_node_put(node); return err; } -void tipc_node_unlock(struct tipc_node *node) -{ - struct net *net = node->net; - u32 addr = 0; - u32 flags = node->action_flags; - u32 link_id = 0; - struct list_head *publ_list; - - if (likely(!flags)) { - spin_unlock_bh(&node->lock); - return; - } - - addr = node->addr; - link_id = node->link_id; - publ_list = &node->publ_list; - - node->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | - TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); - - spin_unlock_bh(&node->lock); - - if (flags & TIPC_NOTIFY_NODE_DOWN) - tipc_publ_notify(net, publ_list, addr); - - if (flags & TIPC_NOTIFY_NODE_UP) - tipc_named_node_up(net, addr); - - if (flags & TIPC_NOTIFY_LINK_UP) - tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, - TIPC_NODE_SCOPE, link_id, addr); - - if (flags & TIPC_NOTIFY_LINK_DOWN) - tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, - link_id, addr); - -} - /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { @@ -997,20 +1159,6 @@ msg_full: return -EMSGSIZE; } -static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel, - int *bearer_id, - struct tipc_media_addr **maddr) -{ - int id = n->active_links[sel & 1]; - - if (unlikely(id < 0)) - return NULL; - - *bearer_id = id; - *maddr = &n->links[id].maddr; - return n->links[id].link; -} - /** * tipc_node_xmit() is the general link level function for message sending * @net: the applicable net namespace @@ -1023,29 +1171,32 @@ static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel, int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { - struct tipc_link *l = NULL; + struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; - struct tipc_media_addr *maddr; - int bearer_id; + int bearer_id = -1; int rc = -EHOSTUNREACH; __skb_queue_head_init(&xmitq); n = tipc_node_find(net, dnode); if (likely(n)) { - tipc_node_lock(n); - l = tipc_node_select_link(n, selector, &bearer_id, &maddr); - if (likely(l)) - rc = tipc_link_xmit(l, list, &xmitq); - tipc_node_unlock(n); - if (unlikely(rc == -ENOBUFS)) + tipc_node_read_lock(n); + bearer_id = n->active_links[selector & 1]; + if (bearer_id >= 0) { + le = &n->links[bearer_id]; + spin_lock_bh(&le->lock); + rc = tipc_link_xmit(le->link, list, &xmitq); + spin_unlock_bh(&le->lock); + } + tipc_node_read_unlock(n); + if (likely(!rc)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + else if (rc == -ENOBUFS) tipc_node_link_down(n, bearer_id, false); tipc_node_put(n); + return rc; } - if (likely(!rc)) { - tipc_bearer_xmit(net, bearer_id, &xmitq, maddr); - return 0; - } + if (likely(in_own_node(net, dnode))) { tipc_sk_rcv(net, list); return 0; @@ -1075,6 +1226,30 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } +void tipc_node_broadcast(struct net *net, struct sk_buff *skb) +{ + struct sk_buff *txskb; + struct tipc_node *n; + u32 dst; + + rcu_read_lock(); + list_for_each_entry_rcu(n, tipc_nodes(net), list) { + dst = n->addr; + if (in_own_node(net, dst)) + continue; + if (!tipc_node_is_up(n)) + continue; + txskb = pskb_copy(skb, GFP_ATOMIC); + if (!txskb) + break; + msg_set_destnode(buf_msg(txskb), dst); + tipc_node_xmit_skb(net, txskb, dst, 0); + } + rcu_read_unlock(); + + kfree_skb(skb); +} + /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace @@ -1116,9 +1291,9 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_BC_ACK) { - tipc_node_lock(n); + tipc_node_read_lock(n); tipc_link_build_ack_msg(le->link, &xmitq); - tipc_node_unlock(n); + tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) @@ -1151,30 +1326,30 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, u16 oseqno = msg_seqno(hdr); u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); u16 exp_pkts = msg_msgcnt(hdr); - u16 rcv_nxt, syncpt, dlv_nxt; + u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; - int i, pb_id; + int pb_id; l = n->links[bearer_id].link; if (!l) return false; - rcv_nxt = l->rcv_nxt; + rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ - for (i = 0; i < MAX_BEARERS; i++) { - if ((i != bearer_id) && n->links[i].link) { - pl = n->links[i].link; + for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { + if ((pb_id != bearer_id) && n->links[pb_id].link) { + pl = n->links[pb_id].link; break; } } - /* Update node accesibility if applicable */ + /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; @@ -1187,8 +1362,12 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + return true; } + if (state == SELF_LEAVING_PEER_DOWN) + return false; + /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; @@ -1197,9 +1376,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && tipc_link_is_up(pl)) { - pb_id = pl->bearer_id; __tipc_node_link_down(n, &pb_id, xmitq, &maddr); - tipc_skb_queue_splice_tail_init(pl->inputq, l->inputq); + tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), + tipc_link_inputq(l)); } /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) @@ -1232,19 +1411,18 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } - if (less(syncpt, n->sync_point)) - n->sync_point = syncpt; } /* Open tunnel link when parallel link reaches synch point */ - if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) { + if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } - dlv_nxt = pl->rcv_nxt - mod(skb_queue_len(pl->inputq)); + inputq_len = skb_queue_len(tipc_link_inputq(pl)); + dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); @@ -1304,22 +1482,32 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr); - else if (unlikely(n->bc_entry.link->acked != bc_ack)) + else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack); - tipc_node_lock(n); - - /* Is reception permitted at the moment ? */ - if (!tipc_node_filter_pkt(n, hdr)) - goto unlock; - - /* Check and if necessary update node state */ - if (likely(tipc_node_check_state(n, skb, bearer_id, &xmitq))) { - rc = tipc_link_rcv(le->link, skb, &xmitq); - skb = NULL; + /* Receive packet directly if conditions permit */ + tipc_node_read_lock(n); + if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { + spin_lock_bh(&le->lock); + if (le->link) { + rc = tipc_link_rcv(le->link, skb, &xmitq); + skb = NULL; + } + spin_unlock_bh(&le->lock); + } + tipc_node_read_unlock(n); + + /* Check/update node state before receiving */ + if (unlikely(skb)) { + tipc_node_write_lock(n); + if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { + if (le->link) { + rc = tipc_link_rcv(le->link, skb, &xmitq); + skb = NULL; + } + } + tipc_node_write_unlock(n); } -unlock: - tipc_node_unlock(n); if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); @@ -1384,15 +1572,15 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - tipc_node_lock(node); + tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; - tipc_node_unlock(node); + tipc_node_read_unlock(node); goto out; } - tipc_node_unlock(node); + tipc_node_read_unlock(node); } done = 1; out: @@ -1402,3 +1590,314 @@ out: return skb->len; } + +/* tipc_node_find_by_name - locate owner node of link by link's name + * @net: the applicable net namespace + * @name: pointer to link name string + * @bearer_id: pointer to index in 'node->links' array where the link was found. + * + * Returns pointer to node owning the link, or 0 if no matching link is found. + */ +static struct tipc_node *tipc_node_find_by_name(struct net *net, + const char *link_name, + unsigned int *bearer_id) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *l; + struct tipc_node *n; + struct tipc_node *found_node = NULL; + int i; + + *bearer_id = 0; + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) { + tipc_node_read_lock(n); + for (i = 0; i < MAX_BEARERS; i++) { + l = n->links[i].link; + if (l && !strcmp(tipc_link_name(l), link_name)) { + *bearer_id = i; + found_node = n; + break; + } + } + tipc_node_read_unlock(n); + if (found_node) + break; + } + rcu_read_unlock(); + + return found_node; +} + +int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) +{ + int err; + int res = 0; + int bearer_id; + char *name; + struct tipc_link *link; + struct tipc_node *node; + struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; + struct net *net = sock_net(skb->sk); + + if (!info->attrs[TIPC_NLA_LINK]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_LINK_NAME]) + return -EINVAL; + + name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + + if (strcmp(name, tipc_bclink_name) == 0) + return tipc_nl_bc_link_set(net, attrs); + + node = tipc_node_find_by_name(net, name, &bearer_id); + if (!node) + return -EINVAL; + + tipc_node_read_lock(node); + + link = node->links[bearer_id].link; + if (!link) { + res = -EINVAL; + goto out; + } + + if (attrs[TIPC_NLA_LINK_PROP]) { + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], + props); + if (err) { + res = err; + goto out; + } + + if (props[TIPC_NLA_PROP_TOL]) { + u32 tol; + + tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + tipc_link_set_tolerance(link, tol); + } + if (props[TIPC_NLA_PROP_PRIO]) { + u32 prio; + + prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); + tipc_link_set_prio(link, prio); + } + if (props[TIPC_NLA_PROP_WIN]) { + u32 win; + + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, win); + } + } + +out: + tipc_node_read_unlock(node); + + return res; +} + +int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct tipc_nl_msg msg; + char *name; + int err; + + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + if (!info->attrs[TIPC_NLA_LINK_NAME]) + return -EINVAL; + name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]); + + msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg.skb) + return -ENOMEM; + + if (strcmp(name, tipc_bclink_name) == 0) { + err = tipc_nl_add_bc_link(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } else { + int bearer_id; + struct tipc_node *node; + struct tipc_link *link; + + node = tipc_node_find_by_name(net, name, &bearer_id); + if (!node) + return -EINVAL; + + tipc_node_read_lock(node); + link = node->links[bearer_id].link; + if (!link) { + tipc_node_read_unlock(node); + nlmsg_free(msg.skb); + return -EINVAL; + } + + err = __tipc_nl_add_link(net, &msg, link, 0); + tipc_node_read_unlock(node); + if (err) { + nlmsg_free(msg.skb); + return err; + } + } + + return genlmsg_reply(msg.skb, info); +} + +int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *link_name; + unsigned int bearer_id; + struct tipc_link *link; + struct tipc_node *node; + struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_link_entry *le; + + if (!info->attrs[TIPC_NLA_LINK]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_LINK_NAME]) + return -EINVAL; + + link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + + if (strcmp(link_name, tipc_bclink_name) == 0) { + err = tipc_bclink_reset_stats(net); + if (err) + return err; + return 0; + } + + node = tipc_node_find_by_name(net, link_name, &bearer_id); + if (!node) + return -EINVAL; + + le = &node->links[bearer_id]; + tipc_node_read_lock(node); + spin_lock_bh(&le->lock); + link = node->links[bearer_id].link; + if (!link) { + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(node); + return -EINVAL; + } + tipc_link_reset_stats(link); + spin_unlock_bh(&le->lock); + tipc_node_read_unlock(node); + return 0; +} + +/* Caller should hold node lock */ +static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, + struct tipc_node *node, u32 *prev_link) +{ + u32 i; + int err; + + for (i = *prev_link; i < MAX_BEARERS; i++) { + *prev_link = i; + + if (!node->links[i].link) + continue; + + err = __tipc_nl_add_link(net, msg, + node->links[i].link, NLM_F_MULTI); + if (err) + return err; + } + *prev_link = 0; + + return 0; +} + +int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_node *node; + struct tipc_nl_msg msg; + u32 prev_node = cb->args[0]; + u32 prev_link = cb->args[1]; + int done = cb->args[2]; + int err; + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rcu_read_lock(); + if (prev_node) { + node = tipc_node_find(net, prev_node); + if (!node) { + /* We never set seq or call nl_dump_check_consistent() + * this means that setting prev_seq here will cause the + * consistence check to fail in the netlink callback + * handler. Resulting in the last NLMSG_DONE message + * having the NLM_F_DUMP_INTR flag set. + */ + cb->prev_seq = 1; + goto out; + } + tipc_node_put(node); + + list_for_each_entry_continue_rcu(node, &tn->node_list, + list) { + tipc_node_read_lock(node); + err = __tipc_nl_add_node_links(net, &msg, node, + &prev_link); + tipc_node_read_unlock(node); + if (err) + goto out; + + prev_node = node->addr; + } + } else { + err = tipc_nl_add_bc_link(net, &msg); + if (err) + goto out; + + list_for_each_entry_rcu(node, &tn->node_list, list) { + tipc_node_read_lock(node); + err = __tipc_nl_add_node_links(net, &msg, node, + &prev_link); + tipc_node_read_unlock(node); + if (err) + goto out; + + prev_node = node->addr; + } + } + done = 1; +out: + rcu_read_unlock(); + + cb->args[0] = prev_node; + cb->args[1] = prev_link; + cb->args[2] = done; + + return skb->len; +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 6734562d3c6e..f39d9d06e8bb 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -42,23 +42,6 @@ #include "bearer.h" #include "msg.h" -/* Out-of-range value for node signature */ -#define INVALID_NODE_SIG 0x10000 - -#define INVALID_BEARER_ID -1 - -/* Flags used to take different actions according to flag type - * TIPC_NOTIFY_NODE_DOWN: notify node is down - * TIPC_NOTIFY_NODE_UP: notify node is up - * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type - */ -enum { - TIPC_NOTIFY_NODE_DOWN = (1 << 3), - TIPC_NOTIFY_NODE_UP = (1 << 4), - TIPC_NOTIFY_LINK_UP = (1 << 6), - TIPC_NOTIFY_LINK_DOWN = (1 << 7) -}; - /* Optional capabilities supported by this code version */ enum { @@ -66,72 +49,8 @@ enum { }; #define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH +#define INVALID_BEARER_ID -1 -struct tipc_link_entry { - struct tipc_link *link; - u32 mtu; - struct sk_buff_head inputq; - struct tipc_media_addr maddr; -}; - -struct tipc_bclink_entry { - struct tipc_link *link; - struct sk_buff_head inputq1; - struct sk_buff_head arrvq; - struct sk_buff_head inputq2; - struct sk_buff_head namedq; -}; - -/** - * struct tipc_node - TIPC node structure - * @addr: network address of node - * @ref: reference counter to node object - * @lock: spinlock governing access to structure - * @net: the applicable net namespace - * @hash: links to adjacent nodes in unsorted hash chain - * @inputq: pointer to input queue containing messages for msg event - * @namedq: pointer to name table input queue with name table messages - * @active_links: bearer ids of active links, used as index into links[] array - * @links: array containing references to all links to node - * @action_flags: bit mask of different types of node actions - * @state: connectivity state vs peer node - * @sync_point: sequence number where synch/failover is finished - * @list: links to adjacent nodes in sorted list of cluster's nodes - * @working_links: number of working links to node (both active and standby) - * @link_cnt: number of links to node - * @capabilities: bitmap, indicating peer node's functional capabilities - * @signature: node instance identifier - * @link_id: local and remote bearer ids of changing link, if any - * @publ_list: list of publications - * @rcu: rcu struct for tipc_node - */ -struct tipc_node { - u32 addr; - struct kref kref; - spinlock_t lock; - struct net *net; - struct hlist_node hash; - int active_links[2]; - struct tipc_link_entry links[MAX_BEARERS]; - struct tipc_bclink_entry bc_entry; - int action_flags; - struct list_head list; - int state; - u16 sync_point; - int link_cnt; - u16 working_links; - u16 capabilities; - u32 signature; - u32 link_id; - struct list_head publ_list; - struct list_head conn_sks; - unsigned long keepalive_intv; - struct timer_list timer; - struct rcu_head rcu; -}; - -struct tipc_node *tipc_node_find(struct net *net, u32 addr); -void tipc_node_put(struct tipc_node *node); void tipc_node_stop(struct net *net); void tipc_node_check_dest(struct net *net, u32 onode, struct tipc_bearer *bearer, @@ -139,50 +58,22 @@ void tipc_node_check_dest(struct net *net, u32 onode, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); -void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -bool tipc_node_is_up(struct tipc_node *n); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); -void tipc_node_unlock(struct tipc_node *node); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector); int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); +void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr); +void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr); +void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); +int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); - -static inline void tipc_node_lock(struct tipc_node *node) -{ - spin_lock_bh(&node->lock); -} - -static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) -{ - int bearer_id = n->active_links[sel & 1]; - - if (unlikely(bearer_id == INVALID_BEARER_ID)) - return NULL; - - return n->links[bearer_id].link; -} - -static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) -{ - struct tipc_node *n; - int bearer_id; - unsigned int mtu = MAX_MSG_SIZE; - - n = tipc_node_find(net, addr); - if (unlikely(!n)) - return mtu; - - bearer_id = n->active_links[sel & 1]; - if (likely(bearer_id != INVALID_BEARER_ID)) - mtu = n->links[bearer_id].mtu; - tipc_node_put(n); - return mtu; -} +int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 552dbaba9cf3..69c29050f14a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -105,6 +105,7 @@ struct tipc_sock { static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); +static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); @@ -381,6 +382,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; + sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); @@ -470,9 +472,6 @@ static int tipc_release(struct socket *sock) tipc_node_remove_conn(net, dnode, tsk->portid); } - /* Discard any remaining (connection-based) messages in receive queue */ - __skb_queue_purge(&sk->sk_receive_queue); - /* Reject any messages that accumulated in backlog queue */ sock->state = SS_DISCONNECTING; release_sock(sk); @@ -1492,7 +1491,7 @@ static void tipc_write_space(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); rcu_read_unlock(); @@ -1509,12 +1508,17 @@ static void tipc_data_ready(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLRDNORM | POLLRDBAND); rcu_read_unlock(); } +static void tipc_sock_destruct(struct sock *sk) +{ + __skb_queue_purge(&sk->sk_receive_queue); +} + /** * filter_connect - Handle all incoming messages for a connection-based socket * @tsk: TIPC socket diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ad2719ad4c1b..d63a911e7fe2 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -48,7 +48,6 @@ #include <linux/tipc_netlink.h> #include "core.h" #include "bearer.h" -#include "msg.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 @@ -158,8 +157,11 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct rtable *rt; - if (skb_headroom(skb) < UDP_MIN_HEADROOM) - pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (skb_headroom(skb) < UDP_MIN_HEADROOM) { + err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (err) + goto tx_error; + } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); ub = rcu_dereference_rtnl(b->media_ptr); @@ -180,15 +182,9 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, goto tx_error; } ttl = ip4_dst_hoplimit(&rt->dst); - err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, - src->ipv4.s_addr, - dst->ipv4.s_addr, 0, ttl, 0, - src->udp_port, dst->udp_port, - false, true); - if (err < 0) { - ip_rt_put(rt); - goto tx_error; - } + udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, + dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, + dst->udp_port, false, true); #if IS_ENABLED(CONFIG_IPV6) } else { struct dst_entry *ndst; @@ -221,10 +217,6 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { struct udp_bearer *ub; struct tipc_bearer *b; - int usr = msg_user(buf_msg(skb)); - - if ((usr == LINK_PROTOCOL) || (usr == NAME_DISTRIBUTOR)) - skb_linearize(skb); ub = rcu_dereference_sk_user_data(sk); if (!ub) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aaa0b58d6aba..c5bf5ef2bf89 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -326,6 +326,118 @@ found: return s; } +/* Support code for asymmetrically connected dgram sockets + * + * If a datagram socket is connected to a socket not itself connected + * to the first socket (eg, /dev/log), clients may only enqueue more + * messages if the present receive queue of the server socket is not + * "too large". This means there's a second writeability condition + * poll and sendmsg need to test. The dgram recv code will do a wake + * up on the peer_wait wait queue of a socket upon reception of a + * datagram which needs to be propagated to sleeping would-be writers + * since these might not have sent anything so far. This can't be + * accomplished via poll_wait because the lifetime of the server + * socket might be less than that of its clients if these break their + * association with it or if the server socket is closed while clients + * are still connected to it and there's no way to inform "a polling + * implementation" that it should let go of a certain wait queue + * + * In order to propagate a wake up, a wait_queue_t of the client + * socket is enqueued on the peer_wait queue of the server socket + * whose wake function does a wake_up on the ordinary client socket + * wait queue. This connection is established whenever a write (or + * poll for write) hit the flow control condition and broken when the + * association to the server socket is dissolved or after a wake up + * was relayed. + */ + +static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, + void *key) +{ + struct unix_sock *u; + wait_queue_head_t *u_sleep; + + u = container_of(q, struct unix_sock, peer_wake); + + __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, + q); + u->peer_wake.private = NULL; + + /* relaying can only happen while the wq still exists */ + u_sleep = sk_sleep(&u->sk); + if (u_sleep) + wake_up_interruptible_poll(u_sleep, key); + + return 0; +} + +static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) +{ + struct unix_sock *u, *u_other; + int rc; + + u = unix_sk(sk); + u_other = unix_sk(other); + rc = 0; + spin_lock(&u_other->peer_wait.lock); + + if (!u->peer_wake.private) { + u->peer_wake.private = other; + __add_wait_queue(&u_other->peer_wait, &u->peer_wake); + + rc = 1; + } + + spin_unlock(&u_other->peer_wait.lock); + return rc; +} + +static void unix_dgram_peer_wake_disconnect(struct sock *sk, + struct sock *other) +{ + struct unix_sock *u, *u_other; + + u = unix_sk(sk); + u_other = unix_sk(other); + spin_lock(&u_other->peer_wait.lock); + + if (u->peer_wake.private == other) { + __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); + u->peer_wake.private = NULL; + } + + spin_unlock(&u_other->peer_wait.lock); +} + +static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, + struct sock *other) +{ + unix_dgram_peer_wake_disconnect(sk, other); + wake_up_interruptible_poll(sk_sleep(sk), + POLLOUT | + POLLWRNORM | + POLLWRBAND); +} + +/* preconditions: + * - unix_peer(sk) == other + * - association is stable + */ +static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) +{ + int connected; + + connected = unix_dgram_peer_wake_connect(sk, other); + + if (unix_recvq_full(other)) + return 1; + + if (connected) + unix_dgram_peer_wake_disconnect(sk, other); + + return 0; +} + static int unix_writable(const struct sock *sk) { return sk->sk_state != TCP_LISTEN && @@ -339,7 +451,7 @@ static void unix_write_space(struct sock *sk) rcu_read_lock(); if (unix_writable(sk)) { wq = rcu_dereference(sk->sk_wq); - if (wq_has_sleeper(wq)) + if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); @@ -431,6 +543,8 @@ static void unix_release_sock(struct sock *sk, int embrion) skpair->sk_state_change(skpair); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); } + + unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; } @@ -441,6 +555,7 @@ static void unix_release_sock(struct sock *sk, int embrion) if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); /* passed fds are erased in the kfree_skb hook */ + UNIXCB(skb).consumed = skb->len; kfree_skb(skb); } @@ -665,6 +780,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); + init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) @@ -837,32 +953,20 @@ fail: return NULL; } -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode, + struct path *res) { - struct dentry *dentry; - struct path path; - int err = 0; - /* - * Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - return err; + int err; - /* - * All right, let's create it. - */ - err = security_path_mknod(&path, dentry, mode, 0); + err = security_path_mknod(path, dentry, mode, 0); if (!err) { - err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); + err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path.mnt); + res->mnt = mntget(path->mnt); res->dentry = dget(dentry); } } - done_path_create(&path, dentry); + return err; } @@ -873,10 +977,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - int err; + int err, name_err; unsigned int hash; struct unix_address *addr; struct hlist_head *list; + struct path path; + struct dentry *dentry; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) @@ -892,14 +998,34 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; + name_err = 0; + dentry = NULL; + if (sun_path[0]) { + /* Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + + if (IS_ERR(dentry)) { + /* delay report until after 'already bound' check */ + name_err = PTR_ERR(dentry); + dentry = NULL; + } + } + err = mutex_lock_interruptible(&u->readlock); if (err) - goto out; + goto out_path; err = -EINVAL; if (u->addr) goto out_up; + if (name_err) { + err = name_err == -EEXIST ? -EADDRINUSE : name_err; + goto out_up; + } + err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) @@ -910,11 +1036,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = hash ^ sk->sk_type; atomic_set(&addr->refcnt, 1); - if (sun_path[0]) { - struct path path; + if (dentry) { + struct path u_path; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); + err = unix_mknod(dentry, &path, mode, &u_path); if (err) { if (err == -EEXIST) err = -EADDRINUSE; @@ -922,9 +1048,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out_up; } addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1); + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = path; + u->path = u_path; list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); @@ -947,6 +1073,10 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->readlock); +out_path: + if (dentry) + done_path_create(&path, dentry); + out: return err; } @@ -1032,6 +1162,8 @@ restart: if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk) = other; + unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); + unix_state_double_unlock(sk, other); if (other != old_peer) @@ -1381,6 +1513,21 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +/* + * The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + #define MAX_RECURSION_LEVEL 4 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) @@ -1389,6 +1536,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) unsigned char max_level = 0; int unix_sock_count = 0; + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + for (i = scm->fp->count - 1; i >= 0; i--) { struct sock *sk = unix_get_socket(scm->fp->fp[i]); @@ -1410,10 +1560,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - if (unix_sock_count) { - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - } + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); return max_level; } @@ -1433,6 +1581,14 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } +static bool unix_passcred_enabled(const struct socket *sock, + const struct sock *other) +{ + return test_bit(SOCK_PASSCRED, &sock->flags) || + !other->sk_socket || + test_bit(SOCK_PASSCRED, &other->sk_socket->flags); +} + /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket @@ -1443,14 +1599,41 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, { if (UNIXCB(skb).pid) return; - if (test_bit(SOCK_PASSCRED, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) { + if (unix_passcred_enabled(sock, other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } +static int maybe_init_creds(struct scm_cookie *scm, + struct socket *socket, + const struct sock *other) +{ + int err; + struct msghdr msg = { .msg_controllen = 0 }; + + err = scm_send(socket, &msg, scm, false); + if (err) + return err; + + if (unix_passcred_enabled(socket, other)) { + scm->pid = get_pid(task_tgid(current)); + current_uid_gid(&scm->creds.uid, &scm->creds.gid); + } + return err; +} + +static bool unix_skb_scm_eq(struct sk_buff *skb, + struct scm_cookie *scm) +{ + const struct unix_skb_parms *u = &UNIXCB(skb); + + return u->pid == scm->pid && + uid_eq(u->uid, scm->creds.uid) && + gid_eq(u->gid, scm->creds.gid) && + unix_secdata_eq(scm, skb); +} + /* * Send AF_UNIX data. */ @@ -1471,6 +1654,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct scm_cookie scm; int max_level; int data_len = 0; + int sk_locked; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1549,12 +1733,14 @@ restart: goto out_free; } + sk_locked = 0; unix_state_lock(other); +restart_locked: err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; - if (sock_flag(other, SOCK_DEAD)) { + if (unlikely(sock_flag(other, SOCK_DEAD))) { /* * Check with 1003.1g - what should * datagram error @@ -1562,10 +1748,14 @@ restart: unix_state_unlock(other); sock_put(other); + if (!sk_locked) + unix_state_lock(sk); + err = 0; - unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk) = NULL; + unix_dgram_peer_wake_disconnect_wakeup(sk, other); + unix_state_unlock(sk); unix_dgram_disconnected(sk, other); @@ -1591,21 +1781,38 @@ restart: goto out_unlock; } - if (unix_peer(other) != sk && unix_recvq_full(other)) { - if (!timeo) { - err = -EAGAIN; - goto out_unlock; + if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + if (timeo) { + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; } - timeo = unix_wait_for_peer(other, timeo); + if (!sk_locked) { + unix_state_unlock(other); + unix_state_double_lock(sk, other); + } - err = sock_intr_errno(timeo); - if (signal_pending(current)) - goto out_free; + if (unix_peer(sk) != other || + unix_dgram_peer_wake_me(sk, other)) { + err = -EAGAIN; + sk_locked = 1; + goto out_unlock; + } - goto restart; + if (!sk_locked) { + sk_locked = 1; + goto restart_locked; + } } + if (unlikely(sk_locked)) + unix_state_unlock(sk); + if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); maybe_add_creds(skb, sock, other); @@ -1619,6 +1826,8 @@ restart: return len; out_unlock: + if (sk_locked) + unix_state_unlock(sk); unix_state_unlock(other); out_free: kfree_skb(skb); @@ -1740,8 +1949,10 @@ out_err: static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page, int offset, size_t size, int flags) { - int err = 0; - bool send_sigpipe = true; + int err; + bool send_sigpipe = false; + bool init_scm = true; + struct scm_cookie scm; struct sock *other, *sk = socket->sk; struct sk_buff *skb, *newskb = NULL, *tail = NULL; @@ -1759,7 +1970,7 @@ alloc_skb: newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT, &err, 0); if (!newskb) - return err; + goto err; } /* we must acquire readlock as we modify already present @@ -1768,12 +1979,12 @@ alloc_skb: err = mutex_lock_interruptible(&unix_sk(other)->readlock); if (err) { err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS; - send_sigpipe = false; goto err; } if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; + send_sigpipe = true; goto err_unlock; } @@ -1782,23 +1993,34 @@ alloc_skb: if (sock_flag(other, SOCK_DEAD) || other->sk_shutdown & RCV_SHUTDOWN) { err = -EPIPE; + send_sigpipe = true; goto err_state_unlock; } + if (init_scm) { + err = maybe_init_creds(&scm, socket, other); + if (err) + goto err_state_unlock; + init_scm = false; + } + skb = skb_peek_tail(&other->sk_receive_queue); if (tail && tail == skb) { skb = newskb; - } else if (!skb) { - if (newskb) + } else if (!skb || !unix_skb_scm_eq(skb, &scm)) { + if (newskb) { skb = newskb; - else + } else { + tail = skb; goto alloc_skb; + } } else if (newskb) { /* this is fast path, we don't necessarily need to * call to kfree_skb even though with newskb == NULL * this - does no harm */ consume_skb(newskb); + newskb = NULL; } if (skb_append_pagefrags(skb, page, offset, size)) { @@ -1811,14 +2033,20 @@ alloc_skb: skb->truesize += size; atomic_add(size, &sk->sk_wmem_alloc); - if (newskb) + if (newskb) { + err = unix_scm_to_skb(&scm, skb, false); + if (err) + goto err_state_unlock; + spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, newskb); + spin_unlock(&other->sk_receive_queue.lock); + } unix_state_unlock(other); mutex_unlock(&unix_sk(other)->readlock); other->sk_data_ready(other); - + scm_destroy(&scm); return size; err_state_unlock: @@ -1829,6 +2057,8 @@ err: kfree_skb(newskb); if (send_sigpipe && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); + if (!init_scm) + scm_destroy(&scm); return err; } @@ -1878,8 +2108,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, struct scm_cookie scm; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; + struct sk_buff *skb, *last; + long timeo; int err; int peeked, skip; @@ -1887,30 +2117,38 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, if (flags&MSG_OOB) goto out; - err = mutex_lock_interruptible(&u->readlock); - if (unlikely(err)) { - /* recvmsg() in non blocking mode is supposed to return -EAGAIN - * sk_rcvtimeo is not honored by mutex_lock_interruptible() - */ - err = noblock ? -EAGAIN : -ERESTARTSYS; - goto out; - } + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - skip = sk_peek_offset(sk, flags); + do { + mutex_lock(&u->readlock); - skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err); - if (!skb) { + skip = sk_peek_offset(sk, flags); + skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err, + &last); + if (skb) + break; + + mutex_unlock(&u->readlock); + + if (err != -EAGAIN) + break; + } while (timeo && + !__skb_wait_for_more_packets(sk, &err, &timeo, last)); + + if (!skb) { /* implies readlock unlocked */ unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN)) err = 0; unix_state_unlock(sk); - goto out_unlock; + goto out; } - wake_up_interruptible_sync_poll(&u->peer_wait, - POLLOUT | POLLWRNORM | POLLWRBAND); + if (wq_has_sleeper(&u->peer_wait)) + wake_up_interruptible_sync_poll(&u->peer_wait, + POLLOUT | POLLWRNORM | + POLLWRBAND); if (msg->msg_name) unix_copy_addr(msg, skb->sk); @@ -1962,7 +2200,6 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, out_free: skb_free_datagram(sk, skb); -out_unlock: mutex_unlock(&u->readlock); out: return err; @@ -1991,7 +2228,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, !timeo) break; - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); timeo = freezable_schedule_timeout(timeo); unix_state_lock(sk); @@ -1999,7 +2236,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, if (sock_flag(sk, SOCK_DEAD)) break; - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } finish_wait(sk_sleep(sk), &wait); @@ -2056,14 +2293,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - err = mutex_lock_interruptible(&u->readlock); - if (unlikely(err)) { - /* recvmsg() in non blocking mode is supposed to return -EAGAIN - * sk_rcvtimeo is not honored by mutex_lock_interruptible() - */ - err = noblock ? -EAGAIN : -ERESTARTSYS; - goto out; - } + mutex_lock(&u->readlock); if (flags & MSG_PEEK) skip = sk_peek_offset(sk, flags); @@ -2072,6 +2302,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) do { int chunk; + bool drop_skb; struct sk_buff *skb, *last; unix_state_lock(sk); @@ -2106,12 +2337,12 @@ again: timeo = unix_stream_data_wait(sk, timeo, last, last_len); - if (signal_pending(current) || - mutex_lock_interruptible(&u->readlock)) { + if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } + mutex_lock(&u->readlock); continue; unlock: unix_state_unlock(sk); @@ -2131,10 +2362,7 @@ unlock: if (check_creds) { /* Never glue messages from different writers */ - if ((UNIXCB(skb).pid != scm.pid) || - !uid_eq(UNIXCB(skb).uid, scm.creds.uid) || - !gid_eq(UNIXCB(skb).gid, scm.creds.gid) || - !unix_secdata_eq(&scm, skb)) + if (!unix_skb_scm_eq(skb, &scm)) break; } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ @@ -2152,7 +2380,11 @@ unlock: } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); + skb_get(skb); chunk = state->recv_actor(skb, skip, chunk, state); + drop_skb = !unix_skb_len(skb); + /* skb is only safe to use if !drop_skb */ + consume_skb(skb); if (chunk < 0) { if (copied == 0) copied = -EFAULT; @@ -2161,6 +2393,18 @@ unlock: copied += chunk; size -= chunk; + if (drop_skb) { + /* the skb was touched by a concurrent reader; + * we should not expect anything from this skb + * anymore and assume it invalid - we can be + * sure it was dropped from the socket queue + * + * let's report a short read + */ + err = 0; + break; + } + /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { UNIXCB(skb).consumed += chunk; @@ -2454,20 +2698,22 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; writable = unix_writable(sk); - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); - if (unix_recvq_full(other)) - writable = 0; - } - sock_put(other); + if (writable) { + unix_state_lock(sk); + + other = unix_peer(sk); + if (other && unix_peer(other) != sk && + unix_recvq_full(other) && + unix_dgram_peer_wake_me(sk, other)) + writable = 0; + + unix_state_unlock(sk); } if (writable) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index a73a226f2d33..8fcdc2283af5 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -120,11 +120,11 @@ void unix_inflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + spin_lock(&unix_gc_lock); + if (s) { struct unix_sock *u = unix_sk(s); - spin_lock(&unix_gc_lock); - if (atomic_long_inc_return(&u->inflight) == 1) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -132,25 +132,28 @@ void unix_inflight(struct file *fp) BUG_ON(list_empty(&u->link)); } unix_tot_inflight++; - spin_unlock(&unix_gc_lock); } + fp->f_cred->user->unix_inflight++; + spin_unlock(&unix_gc_lock); } void unix_notinflight(struct file *fp) { struct sock *s = unix_get_socket(fp); + spin_lock(&unix_gc_lock); + if (s) { struct unix_sock *u = unix_sk(s); - spin_lock(&unix_gc_lock); BUG_ON(list_empty(&u->link)); if (atomic_long_dec_and_test(&u->inflight)) list_del_init(&u->link); unix_tot_inflight--; - spin_unlock(&unix_gc_lock); } + fp->f_cred->user->unix_inflight--; + spin_unlock(&unix_gc_lock); } static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index 2ad46f39649f..1820e74a5752 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -121,7 +121,7 @@ struct vmci_transport { u64 queue_pair_max_size; u32 detach_sub_id; union vmci_transport_notify notify; - struct vmci_transport_notify_ops *notify_ops; + const struct vmci_transport_notify_ops *notify_ops; struct list_head elem; struct sock *sk; spinlock_t lock; /* protects sk. */ diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 9b7f207f2bee..fd8cf0214d51 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -661,7 +661,7 @@ static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk) } /* Socket control packet based operations. */ -struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = { +const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = { vmci_transport_notify_pkt_socket_init, vmci_transport_notify_pkt_socket_destruct, vmci_transport_notify_pkt_poll_in, diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h index 7df793249b6c..3c464d394a8f 100644 --- a/net/vmw_vsock/vmci_transport_notify.h +++ b/net/vmw_vsock/vmci_transport_notify.h @@ -77,7 +77,8 @@ struct vmci_transport_notify_ops { void (*process_negotiate) (struct sock *sk); }; -extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops; -extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops; +extern const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops; +extern const +struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops; #endif /* __VMCI_TRANSPORT_NOTIFY_H__ */ diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index dc9c7929a2f9..21e591dafb03 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -419,7 +419,7 @@ vmci_transport_notify_pkt_send_pre_enqueue( } /* Socket always on control packet based operations. */ -struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = { +const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = { vmci_transport_notify_pkt_socket_init, vmci_transport_notify_pkt_socket_destruct, vmci_transport_notify_pkt_poll_in, diff --git a/net/wireless/core.h b/net/wireless/core.h index a618b4b86fa4..022ccad06cbe 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -416,13 +416,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); void cfg80211_process_wdev_events(struct wireless_dev *wdev); -int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - enum nl80211_iftype iftype, - struct ieee80211_channel *chan, - enum cfg80211_chan_mode chanmode, - u8 radar_detect); - /** * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable * @wiphy: the wiphy to validate against diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index dc0e59e53dbf..6beab0cfcb99 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -311,8 +311,8 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) } keyidx >>= 6; if (key->key_idx != keyidx) { - printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame " - "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv); + net_dbg_ratelimited("CCMP: RX tkey->key_idx=%d frame keyidx=%d\n", + key->key_idx, keyidx); return -6; } if (!key->key_set) { diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index 8c90ba79e56e..3cd819539241 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -434,8 +434,8 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) } keyidx >>= 6; if (tkey->key_idx != keyidx) { - printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame " - "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv); + net_dbg_ratelimited("TKIP: RX tkey->key_idx=%d frame keyidx=%d\n", + tkey->key_idx, keyidx); return -6; } if (!tkey->key_set) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c71e274c810a..d4786f2802aa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4256,8 +4256,8 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) * station. Include these parameters here and will check them in * cfg80211_check_station_change(). */ - if (info->attrs[NL80211_ATTR_PEER_AID]) - params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); + if (info->attrs[NL80211_ATTR_STA_AID]) + params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]) params.listen_interval = @@ -4359,6 +4359,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct station_parameters params; u8 *mac_addr = NULL; + u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) | + BIT(NL80211_STA_FLAG_ASSOCIATED); memset(¶ms, 0, sizeof(params)); @@ -4470,10 +4472,23 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) /* allow authenticated/associated only if driver handles it */ if (!(rdev->wiphy.features & NL80211_FEATURE_FULL_AP_CLIENT_STATE) && - params.sta_flags_mask & - (BIT(NL80211_STA_FLAG_AUTHENTICATED) | - BIT(NL80211_STA_FLAG_ASSOCIATED))) - return -EINVAL; + params.sta_flags_mask & auth_assoc) + return -EINVAL; + + /* Older userspace, or userspace wanting to be compatible with + * !NL80211_FEATURE_FULL_AP_CLIENT_STATE, will not set the auth + * and assoc flags in the mask, but assumes the station will be + * added as associated anyway since this was the required driver + * behaviour before NL80211_FEATURE_FULL_AP_CLIENT_STATE was + * introduced. + * In order to not bother drivers with this quirk in the API + * set the flags in both the mask and set for new stations in + * this case. + */ + if (!(params.sta_flags_mask & auth_assoc)) { + params.sta_flags_mask |= auth_assoc; + params.sta_flags_set |= auth_assoc; + } /* must be last in here for error handling */ params.vlan = get_vlan(info, rdev); @@ -5997,6 +6012,24 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl80211_abort_scan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + + if (!rdev->ops->abort_scan) + return -EOPNOTSUPP; + + if (rdev->scan_msg) + return 0; + + if (!rdev->scan_req) + return -ENOENT; + + rdev_abort_scan(rdev, wdev); + return 0; +} + static int nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, struct cfg80211_sched_scan_request *request, @@ -6507,8 +6540,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, if (WARN_ON(!cac_time_ms)) cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; - err = rdev->ops->start_radar_detection(&rdev->wiphy, dev, &chandef, - cac_time_ms); + err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms); if (!err) { wdev->chandef = chandef; wdev->cac_started = true; @@ -7571,7 +7603,7 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) if (!nl80211_parse_mcast_rate(rdev, mcast_rate, nla_rate)) return -EINVAL; - err = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); + err = rdev_set_mcast_rate(rdev, dev, mcast_rate); return err; } @@ -7941,8 +7973,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { if (!(rdev->wiphy.features & NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || - !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) { + kzfree(connkeys); return -EINVAL; + } connect.flags |= ASSOC_REQ_USE_RRM; } @@ -9503,6 +9537,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) if (new_triggers.tcp && new_triggers.tcp->sock) sock_release(new_triggers.tcp->sock); kfree(new_triggers.tcp); + kfree(new_triggers.nd_config); return err; } #endif @@ -9716,7 +9751,7 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) { cfg80211_rdev_free_coalesce(rdev); - rdev->ops->set_coalesce(&rdev->wiphy, NULL); + rdev_set_coalesce(rdev, NULL); return 0; } @@ -9744,7 +9779,7 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) i++; } - err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce); + err = rdev_set_coalesce(rdev, &new_coalesce); if (err) goto error; @@ -10946,6 +10981,14 @@ static const struct genl_ops nl80211_ops[] = { NL80211_FLAG_NEED_RTNL, }, { + .cmd = NL80211_CMD_ABORT_SCAN, + .doit = nl80211_abort_scan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { .cmd = NL80211_CMD_GET_SCAN, .policy = nl80211_policy, .dumpit = nl80211_dump_scan, diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c index c00d4a792319..e64dbf16330c 100644 --- a/net/wireless/ocb.c +++ b/net/wireless/ocb.c @@ -29,6 +29,9 @@ int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev, if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) return -EOPNOTSUPP; + if (!rdev->ops->join_ocb) + return -EOPNOTSUPP; + if (WARN_ON(!setup->chandef.chan)) return -EINVAL; diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index c23516d0f807..8ae0c04f9fc7 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -427,6 +427,14 @@ static inline int rdev_scan(struct cfg80211_registered_device *rdev, return ret; } +static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + trace_rdev_abort_scan(&rdev->wiphy, wdev); + rdev->ops->abort_scan(&rdev->wiphy, wdev); + trace_rdev_return_void(&rdev->wiphy); +} + static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) @@ -1020,4 +1028,47 @@ rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int +rdev_start_radar_detection(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_chan_def *chandef, + u32 cac_time_ms) +{ + int ret = -ENOTSUPP; + + trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, + cac_time_ms); + if (rdev->ops->start_radar_detection) + ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev, + chandef, cac_time_ms); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, + struct net_device *dev, + int mcast_rate[IEEE80211_NUM_BANDS]) +{ + int ret = -ENOTSUPP; + + trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); + if (rdev->ops->set_mcast_rate) + ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_set_coalesce(struct cfg80211_registered_device *rdev, + struct cfg80211_coalesce *coalesce) +{ + int ret = -ENOTSUPP; + + trace_rdev_set_coalesce(&rdev->wiphy, coalesce); + if (rdev->ops->set_coalesce) + ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2e8d6f39ed56..3b0ce1c484a3 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1052,7 +1052,7 @@ static u32 map_regdom_flags(u32 rd_flags) } static const struct ieee80211_reg_rule * -freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, +freq_reg_info_regd(u32 center_freq, const struct ieee80211_regdomain *regd, u32 bw) { int i; @@ -1097,7 +1097,7 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw) u32 bw; for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { - reg_rule = freq_reg_info_regd(wiphy, center_freq, regd, bw); + reg_rule = freq_reg_info_regd(center_freq, regd, bw); if (!IS_ERR(reg_rule)) return reg_rule; } @@ -1166,6 +1166,41 @@ static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd, #endif } +static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd, + const struct ieee80211_reg_rule *reg_rule, + const struct ieee80211_channel *chan) +{ + const struct ieee80211_freq_range *freq_range = NULL; + u32 max_bandwidth_khz, bw_flags = 0; + + freq_range = ®_rule->freq_range; + + max_bandwidth_khz = freq_range->max_bandwidth_khz; + /* Check if auto calculation requested */ + if (reg_rule->flags & NL80211_RRF_AUTO_BW) + max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + + /* If we get a reg_rule we can assume that at least 5Mhz fit */ + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + return bw_flags; +} + /* * Note that right now we assume the desired channel bandwidth * is always 20 MHz for each individual channel (HT40 uses 20 MHz @@ -1178,11 +1213,9 @@ static void handle_channel(struct wiphy *wiphy, u32 flags, bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; - const struct ieee80211_freq_range *freq_range = NULL; struct wiphy *request_wiphy = NULL; struct regulatory_request *lr = get_last_request(); const struct ieee80211_regdomain *regd; - u32 max_bandwidth_khz; request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx); @@ -1223,31 +1256,7 @@ static void handle_channel(struct wiphy *wiphy, chan_reg_rule_print_dbg(regd, chan, reg_rule); power_rule = ®_rule->power_rule; - freq_range = ®_rule->freq_range; - - max_bandwidth_khz = freq_range->max_bandwidth_khz; - /* Check if auto calculation requested */ - if (reg_rule->flags & NL80211_RRF_AUTO_BW) - max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); - - /* If we get a reg_rule we can assume that at least 5Mhz fit */ - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(10))) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(20))) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; + bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && @@ -1760,13 +1769,10 @@ static void handle_channel_custom(struct wiphy *wiphy, u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; - const struct ieee80211_freq_range *freq_range = NULL; - u32 max_bandwidth_khz; u32 bw; for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) { - reg_rule = freq_reg_info_regd(wiphy, - MHZ_TO_KHZ(chan->center_freq), + reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), regd, bw); if (!IS_ERR(reg_rule)) break; @@ -1787,31 +1793,7 @@ static void handle_channel_custom(struct wiphy *wiphy, chan_reg_rule_print_dbg(regd, chan, reg_rule); power_rule = ®_rule->power_rule; - freq_range = ®_rule->freq_range; - - max_bandwidth_khz = freq_range->max_bandwidth_khz; - /* Check if auto calculation requested */ - if (reg_rule->flags & NL80211_RRF_AUTO_BW) - max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); - - /* If we get a reg_rule we can assume that at least 5Mhz fit */ - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(10))) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(20))) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; + bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan); chan->dfs_state_entered = jiffies; chan->dfs_state = NL80211_DFS_USABLE; @@ -3029,6 +3011,7 @@ int set_regdom(const struct ieee80211_regdomain *rd, break; default: WARN(1, "invalid initiator %d\n", lr->initiator); + kfree(rd); return -EINVAL; } @@ -3221,8 +3204,10 @@ int __init regulatory_init(void) /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { - if (err == -ENOMEM) + if (err == -ENOMEM) { + platform_device_unregister(reg_pdev); return err; + } /* * N.B. kobject_uevent_env() can fail mainly for when we're out * memory which is handled and propagated appropriately above diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 0c392d36781b..09b242b09bed 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -623,12 +623,24 @@ DECLARE_EVENT_CLASS(station_add_change, __field(u32, sta_flags_set) __field(u32, sta_modify_mask) __field(int, listen_interval) + __field(u16, capability) __field(u16, aid) __field(u8, plink_action) __field(u8, plink_state) __field(u8, uapsd_queues) + __field(u8, max_sp) + __field(u8, opmode_notif) + __field(bool, opmode_notif_used) __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap)) + __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap)) __array(char, vlan, IFNAMSIZ) + __dynamic_array(u8, supported_rates, + params->supported_rates_len) + __dynamic_array(u8, ext_capab, params->ext_capab_len) + __dynamic_array(u8, supported_channels, + params->supported_channels_len) + __dynamic_array(u8, supported_oper_classes, + params->supported_oper_classes_len) ), TP_fast_assign( WIPHY_ASSIGN; @@ -646,9 +658,35 @@ DECLARE_EVENT_CLASS(station_add_change, if (params->ht_capa) memcpy(__entry->ht_capa, params->ht_capa, sizeof(struct ieee80211_ht_cap)); + memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); + if (params->vht_capa) + memcpy(__entry->vht_capa, params->vht_capa, + sizeof(struct ieee80211_vht_cap)); memset(__entry->vlan, 0, sizeof(__entry->vlan)); if (params->vlan) memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ); + if (params->supported_rates && params->supported_rates_len) + memcpy(__get_dynamic_array(supported_rates), + params->supported_rates, + params->supported_rates_len); + if (params->ext_capab && params->ext_capab_len) + memcpy(__get_dynamic_array(ext_capab), + params->ext_capab, + params->ext_capab_len); + if (params->supported_channels && + params->supported_channels_len) + memcpy(__get_dynamic_array(supported_channels), + params->supported_channels, + params->supported_channels_len); + if (params->supported_oper_classes && + params->supported_oper_classes_len) + memcpy(__get_dynamic_array(supported_oper_classes), + params->supported_oper_classes, + params->supported_oper_classes_len); + __entry->max_sp = params->max_sp; + __entry->capability = params->capability; + __entry->opmode_notif = params->opmode_notif; + __entry->opmode_notif_used = params->opmode_notif_used; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", station flags mask: %u, station flags set: %u, " @@ -2818,6 +2856,71 @@ TRACE_EVENT(cfg80211_stop_iface, WIPHY_PR_ARG, WDEV_PR_ARG) ); +TRACE_EVENT(rdev_start_radar_detection, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_chan_def *chandef, + u32 cac_time_ms), + TP_ARGS(wiphy, netdev, chandef, cac_time_ms), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + CHAN_DEF_ENTRY + __field(u32, cac_time_ms) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + __entry->cac_time_ms = cac_time_ms; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT + ", cac_time_ms=%u", + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, + __entry->cac_time_ms) +); + +TRACE_EVENT(rdev_set_mcast_rate, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + int mcast_rate[IEEE80211_NUM_BANDS]), + TP_ARGS(wiphy, netdev, mcast_rate), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __array(int, mcast_rate, IEEE80211_NUM_BANDS) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + memcpy(__entry->mcast_rate, mcast_rate, + sizeof(int) * IEEE80211_NUM_BANDS); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " + "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->mcast_rate[IEEE80211_BAND_2GHZ], + __entry->mcast_rate[IEEE80211_BAND_5GHZ], + __entry->mcast_rate[IEEE80211_BAND_60GHZ]) +); + +TRACE_EVENT(rdev_set_coalesce, + TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce), + TP_ARGS(wiphy, coalesce), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(int, n_rules) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->n_rules = coalesce ? coalesce->n_rules : 0; + ), + TP_printk(WIPHY_PR_FMT ", n_rules=%d", + WIPHY_PR_ARG, __entry->n_rules) +); + +DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index baf7218cec15..92770427b211 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1325,13 +1325,6 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, } EXPORT_SYMBOL(ieee80211_ie_split_ric); -size_t ieee80211_ie_split(const u8 *ies, size_t ielen, - const u8 *ids, int n_ids, size_t offset) -{ - return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset); -} -EXPORT_SYMBOL(ieee80211_ie_split); - bool ieee80211_operating_class_to_band(u8 operating_class, enum ieee80211_band *band) { @@ -1620,120 +1613,6 @@ int cfg80211_check_combinations(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_check_combinations); -int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - enum nl80211_iftype iftype, - struct ieee80211_channel *chan, - enum cfg80211_chan_mode chanmode, - u8 radar_detect) -{ - struct wireless_dev *wdev_iter; - int num[NUM_NL80211_IFTYPES]; - struct ieee80211_channel - *used_channels[CFG80211_MAX_NUM_DIFFERENT_CHANNELS]; - struct ieee80211_channel *ch; - enum cfg80211_chan_mode chmode; - int num_different_channels = 0; - int total = 1; - int i; - - ASSERT_RTNL(); - - if (WARN_ON(hweight32(radar_detect) > 1)) - return -EINVAL; - - if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) - return -EINVAL; - - /* Always allow software iftypes */ - if (rdev->wiphy.software_iftypes & BIT(iftype)) { - if (radar_detect) - return -EINVAL; - return 0; - } - - memset(num, 0, sizeof(num)); - memset(used_channels, 0, sizeof(used_channels)); - - num[iftype] = 1; - - /* TODO: We'll probably not need this anymore, since this - * should only be called with CHAN_MODE_UNDEFINED. There are - * still a couple of pending calls where other chanmodes are - * used, but we should get rid of them. - */ - switch (chanmode) { - case CHAN_MODE_UNDEFINED: - break; - case CHAN_MODE_SHARED: - WARN_ON(!chan); - used_channels[0] = chan; - num_different_channels++; - break; - case CHAN_MODE_EXCLUSIVE: - num_different_channels++; - break; - } - - list_for_each_entry(wdev_iter, &rdev->wdev_list, list) { - if (wdev_iter == wdev) - continue; - if (wdev_iter->iftype == NL80211_IFTYPE_P2P_DEVICE) { - if (!wdev_iter->p2p_started) - continue; - } else if (wdev_iter->netdev) { - if (!netif_running(wdev_iter->netdev)) - continue; - } else { - WARN_ON(1); - } - - if (rdev->wiphy.software_iftypes & BIT(wdev_iter->iftype)) - continue; - - /* - * We may be holding the "wdev" mutex, but now need to lock - * wdev_iter. This is OK because once we get here wdev_iter - * is not wdev (tested above), but we need to use the nested - * locking for lockdep. - */ - mutex_lock_nested(&wdev_iter->mtx, 1); - __acquire(wdev_iter->mtx); - cfg80211_get_chan_state(wdev_iter, &ch, &chmode, &radar_detect); - wdev_unlock(wdev_iter); - - switch (chmode) { - case CHAN_MODE_UNDEFINED: - break; - case CHAN_MODE_SHARED: - for (i = 0; i < CFG80211_MAX_NUM_DIFFERENT_CHANNELS; i++) - if (!used_channels[i] || used_channels[i] == ch) - break; - - if (i == CFG80211_MAX_NUM_DIFFERENT_CHANNELS) - return -EBUSY; - - if (used_channels[i] == NULL) { - used_channels[i] = ch; - num_different_channels++; - } - break; - case CHAN_MODE_EXCLUSIVE: - num_different_channels++; - break; - } - - num[wdev_iter->iftype]++; - total++; - } - - if (total == 1 && !radar_detect) - return 0; - - return cfg80211_check_combinations(&rdev->wiphy, num_different_channels, - radar_detect, num); -} - int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index cc3676eb6239..ff4a91fcab9f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -167,6 +167,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb { struct sk_buff *segs; + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 09bfcbac63bb..b5e665b3cfb0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -303,6 +303,14 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) } EXPORT_SYMBOL(xfrm_policy_alloc); +static void xfrm_policy_destroy_rcu(struct rcu_head *head) +{ + struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); + + security_xfrm_policy_free(policy->security); + kfree(policy); +} + /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) @@ -312,8 +320,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); - security_xfrm_policy_free(policy->security); - kfree(policy); + call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); @@ -1214,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, struct xfrm_policy *pol; struct net *net = sock_net(sk); + rcu_read_lock(); read_lock_bh(&net->xfrm.xfrm_policy_lock); - if ((pol = sk->sk_policy[dir]) != NULL) { + pol = rcu_dereference(sk->sk_policy[dir]); + if (pol != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); int err = 0; @@ -1239,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, } out: read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return pol; } @@ -1307,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) #endif write_lock_bh(&net->xfrm.xfrm_policy_lock); - old_pol = sk->sk_policy[dir]; - sk->sk_policy[dir] = pol; + old_pol = rcu_dereference_protected(sk->sk_policy[dir], + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = get_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } + rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); @@ -1361,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) return newp; } -int __xfrm_sk_clone_policy(struct sock *sk) +int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { - struct xfrm_policy *p0 = sk->sk_policy[0], - *p1 = sk->sk_policy[1]; + const struct xfrm_policy *p; + struct xfrm_policy *np; + int i, ret = 0; - sk->sk_policy[0] = sk->sk_policy[1] = NULL; - if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL) - return -ENOMEM; - if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL) - return -ENOMEM; - return 0; + rcu_read_lock(); + for (i = 0; i < 2; i++) { + p = rcu_dereference(osk->sk_policy[i]); + if (p) { + np = clone_policy(p, i); + if (unlikely(!np)) { + ret = -ENOMEM; + break; + } + rcu_assign_pointer(sk->sk_policy[i], np); + } + } + rcu_read_unlock(); + return ret; } static int @@ -2198,6 +2218,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, xdst = NULL; route = NULL; + sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); @@ -2477,6 +2498,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } pol = NULL; + sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl); if (IS_ERR(pol)) { @@ -2804,7 +2826,6 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { - struct net *net; int err = 0; if (unlikely(afinfo == NULL)) return -EINVAL; @@ -2835,26 +2856,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) } spin_unlock(&xfrm_policy_afinfo_lock); - rtnl_lock(); - for_each_net(net) { - struct dst_ops *xfrm_dst_ops; - - switch (afinfo->family) { - case AF_INET: - xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops; - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops; - break; -#endif - default: - BUG(); - } - *xfrm_dst_ops = *afinfo->dst_ops; - } - rtnl_unlock(); - return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); @@ -2890,22 +2891,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); -static void __net_init xfrm_dst_ops_init(struct net *net) -{ - struct xfrm_policy_afinfo *afinfo; - - rcu_read_lock(); - afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]); - if (afinfo) - net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops; -#if IS_ENABLED(CONFIG_IPV6) - afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]); - if (afinfo) - net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops; -#endif - rcu_read_unlock(); -} - static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -3054,7 +3039,6 @@ static int __net_init xfrm_net_init(struct net *net) rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; - xfrm_dst_ops_init(net); rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; |