Diffstat (limited to 'net')
582 files changed, 16449 insertions, 10807 deletions
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 4fa2fdda174d..9e56fb98f33c 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -18,7 +18,7 @@
 #include "nhc.h"
 
 static struct rb_root rb_root = RB_ROOT;
-static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX];
+static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX + 1];
 static DEFINE_SPINLOCK(lowpan_nhc_lock);
 
 static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index dc4411165e43..1f99678751df 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -75,6 +75,14 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg,
 	return 0;
 }
 
+static void vlan_stacked_transfer_operstate(const struct net_device *rootdev,
+					    struct net_device *dev,
+					    struct vlan_dev_priv *vlan)
+{
+	if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+		netif_stacked_transfer_operstate(rootdev, dev);
+}
+
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -180,7 +188,7 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
 	/* Account for reference in struct vlan_dev_priv */
 	dev_hold(real_dev);
 
-	netif_stacked_transfer_operstate(real_dev, dev);
+	vlan_stacked_transfer_operstate(real_dev, dev, vlan);
 	linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
 
 	/* So, got the sucker initialized, now lets place
@@ -399,7 +407,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	case NETDEV_CHANGE:
 		/* Propagate real device state to vlan devices */
 		vlan_group_for_each_dev(grp, i, vlandev)
-			netif_stacked_transfer_operstate(dev, vlandev);
+			vlan_stacked_transfer_operstate(dev, vlandev,
+							vlan_dev_priv(vlandev));
 		break;
 
 	case NETDEV_CHANGEADDR:
@@ -446,7 +455,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		dev_close_many(&close_list, false);
 
 		list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
-			netif_stacked_transfer_operstate(dev, vlandev);
+			vlan_stacked_transfer_operstate(dev, vlandev,
+							vlan_dev_priv(vlandev));
 			list_del_init(&vlandev->close_list);
 		}
 		list_del(&close_list);
@@ -463,7 +473,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
 				dev_change_flags(vlandev, flgs | IFF_UP, extack);
-			netif_stacked_transfer_operstate(dev, vlandev);
+			vlan_stacked_transfer_operstate(dev, vlandev, vlan);
 		}
 		break;
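The nhc.c change at the top fixes an off-by-one: the table is indexed by next-header values up to and including NEXTHDR_MAX, so it needs NEXTHDR_MAX + 1 slots. A minimal userspace-style sketch of the same pitfall (names and value hypothetical):

#include <stdio.h>

#define NEXTHDR_MAX 255			/* highest valid index, inclusive */

/* NEXTHDR_MAX + 1 slots cover indices 0..NEXTHDR_MAX */
static const char *handler[NEXTHDR_MAX + 1];

int main(void)
{
	/* With only NEXTHDR_MAX slots, this write would land one past the end. */
	handler[NEXTHDR_MAX] = "mobility";
	printf("%s\n", handler[NEXTHDR_MAX]);
	return 0;
}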
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 15293c2a5dd8..2a9a60733594 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -223,7 +223,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
 	u32 old_flags = vlan->flags;
 
 	if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
-		     VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+		     VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+		     VLAN_FLAG_BRIDGE_BINDING))
 		return -EINVAL;
 
 	vlan->flags = (old_flags & ~mask) | (flags & mask);
@@ -296,7 +297,8 @@ static int vlan_dev_open(struct net_device *dev)
 	if (vlan->flags & VLAN_FLAG_MVRP)
 		vlan_mvrp_request_join(dev);
 
-	if (netif_carrier_ok(real_dev))
+	if (netif_carrier_ok(real_dev) &&
+	    !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
 		netif_carrier_on(dev);
 	return 0;
 
@@ -326,7 +328,8 @@ static int vlan_dev_stop(struct net_device *dev)
 	if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
 		dev_uc_del(real_dev, dev->dev_addr);
 
-	netif_carrier_off(dev);
+	if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+		netif_carrier_off(dev);
 	return 0;
 }
@@ -367,10 +370,12 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	ifrr.ifr_ifru = ifr->ifr_ifru;
 
 	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		if (!net_eq(dev_net(dev), &init_net))
+			break;
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
 	case SIOCSMIIREG:
-	case SIOCSHWTSTAMP:
 	case SIOCGHWTSTAMP:
 		if (netif_device_present(real_dev) && ops->ndo_do_ioctl)
 			err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd);
@@ -443,27 +448,29 @@ static int vlan_dev_fcoe_disable(struct net_device *dev)
 	return rc;
 }
 
-static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
+static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
+				    struct scatterlist *sgl, unsigned int sgc)
 {
 	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
 	const struct net_device_ops *ops = real_dev->netdev_ops;
-	int rc = -EINVAL;
+	int rc = 0;
+
+	if (ops->ndo_fcoe_ddp_target)
+		rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
 
-	if (ops->ndo_fcoe_get_wwn)
-		rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
 	return rc;
 }
+#endif
 
-static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
-				    struct scatterlist *sgl, unsigned int sgc)
+#ifdef NETDEV_FCOE_WWNN
+static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
 {
 	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
 	const struct net_device_ops *ops = real_dev->netdev_ops;
-	int rc = 0;
-
-	if (ops->ndo_fcoe_ddp_target)
-		rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
+	int rc = -EINVAL;
 
+	if (ops->ndo_fcoe_get_wwn)
+		rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
 	return rc;
 }
 #endif
@@ -548,7 +555,8 @@ static const struct net_device_ops vlan_netdev_ops;
 
 static int vlan_dev_init(struct net_device *dev)
 {
-	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+	struct net_device *real_dev = vlan->real_dev;
 
 	netif_carrier_off(dev);
 
@@ -559,6 +567,9 @@ static int vlan_dev_init(struct net_device *dev)
 		      (1<<__LINK_STATE_DORMANT))) |
 		  (1<<__LINK_STATE_PRESENT);
 
+	if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING)
+		dev->state |= (1 << __LINK_STATE_NOCARRIER);
+
 	dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
 			   NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
 			   NETIF_F_GSO_ENCAP_ALL |
@@ -589,8 +600,7 @@ static int vlan_dev_init(struct net_device *dev)
 #endif
 
 	dev->needed_headroom = real_dev->needed_headroom;
-	if (vlan_hw_offload_capable(real_dev->features,
-				    vlan_dev_priv(dev)->vlan_proto)) {
+	if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) {
 		dev->header_ops      = &vlan_passthru_header_ops;
 		dev->hard_header_len = real_dev->hard_header_len;
 	} else {
@@ -604,8 +614,8 @@ static int vlan_dev_init(struct net_device *dev)
 
 	vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev));
 
-	vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
-	if (!vlan_dev_priv(dev)->vlan_pcpu_stats)
+	vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
+	if (!vlan->vlan_pcpu_stats)
 		return -ENOMEM;
 
 	return 0;
@@ -794,9 +804,11 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
 	.ndo_fcoe_enable	= vlan_dev_fcoe_enable,
 	.ndo_fcoe_disable	= vlan_dev_fcoe_disable,
-	.ndo_fcoe_get_wwn	= vlan_dev_fcoe_get_wwn,
 	.ndo_fcoe_ddp_target	= vlan_dev_fcoe_ddp_target,
 #endif
+#ifdef NETDEV_FCOE_WWNN
+	.ndo_fcoe_get_wwn	= vlan_dev_fcoe_get_wwn,
+#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= vlan_dev_poll_controller,
 	.ndo_netpoll_setup	= vlan_dev_netpoll_setup,
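With VLAN_FLAG_BRIDGE_BINDING set, the vlan device above stops mirroring the lower device's carrier and operstate; open/stop skip the carrier updates so a bridge can own link state instead. A reduced sketch of the gating pattern, with deliberately simplified structures (not the kernel API):

#include <stdbool.h>

#define FLAG_BRIDGE_BINDING 0x10	/* stands in for VLAN_FLAG_BRIDGE_BINDING */

struct vlan_like {
	unsigned int flags;
	bool carrier;
};

/* Only follow the lower device's carrier when bridge binding is off. */
static void transfer_carrier(struct vlan_like *v, bool lower_carrier)
{
	if (v->flags & FLAG_BRIDGE_BINDING)
		return;			/* another owner (the bridge) drives it */
	v->carrier = lower_carrier;
}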
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 9b60c1e399e2..24eebbc92364 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -35,8 +35,8 @@ static inline int vlan_validate_qos_map(struct nlattr *attr)
 {
 	if (!attr)
 		return 0;
-	return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy,
-				   NULL);
+	return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX,
+					      vlan_map_policy, NULL);
 }
 
 static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -84,7 +84,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		flags = nla_data(data[IFLA_VLAN_FLAGS]);
 		if ((flags->flags & flags->mask) &
 		    ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
-		      VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
+		      VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+		      VLAN_FLAG_BRIDGE_BINDING)) {
 			NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
 			return -EINVAL;
 		}
@@ -226,7 +227,7 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 	if (vlan->nr_ingress_mappings) {
-		nest = nla_nest_start(skb, IFLA_VLAN_INGRESS_QOS);
+		nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS);
 		if (nest == NULL)
 			goto nla_put_failure;
 
@@ -244,7 +245,7 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	}
 
 	if (vlan->nr_egress_mappings) {
-		nest = nla_nest_start(skb, IFLA_VLAN_EGRESS_QOS);
+		nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS);
 		if (nest == NULL)
 			goto nla_put_failure;
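The vlan_netlink.c hunks track a netlink API rename: nla_nest_start() now sets NLA_F_NESTED by default, and the old behaviour lives on in the *_noflag/*_deprecated variants. The dump-side pattern, sketched for a hypothetical attribute set (MYDRV_ATTR_* are assumptions, not real attributes):

#include <net/netlink.h>

/* Sketch only: nest a hypothetical QOS attribute group into a dump. */
static int mydrv_fill_info(struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, MYDRV_ATTR_QOS);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, MYDRV_ATTR_QOS_LEVEL, 7)) {
		nla_nest_cancel(skb, nest);	/* roll back the partial nest */
		return -EMSGSIZE;
	}

	nla_nest_end(skb, nest);
	return 0;
}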
diff --git a/net/Kconfig b/net/Kconfig
index 1efe1f9ee492..3e8fdd688329 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -429,11 +429,8 @@ config NET_SOCK_MSG
 	  with the help of BPF programs.
 
 config NET_DEVLINK
-	bool "Network physical/parent device Netlink interface"
-	help
-	  Network physical/parent device Netlink interface provides
-	  infrastructure to support access to physical chip-wide config and
-	  monitoring.
+	bool
+	default n
 
 config PAGE_POOL
 	bool
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 709d2542f729..a2555023c654 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1806,12 +1806,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		rc = put_user(amount, (int __user *)argp);
 		break;
 	}
-	case SIOCGSTAMP:
-		rc = sock_get_timestamp(sk, argp);
-		break;
-	case SIOCGSTAMPNS:
-		rc = sock_get_timestampns(sk, argp);
-		break;
 	/* Routing */
 	case SIOCADDRT:
 	case SIOCDELRT:
@@ -1871,6 +1865,7 @@ static const struct proto_ops atalk_dgram_ops = {
 	.getname	= atalk_getname,
 	.poll		= datagram_poll,
 	.ioctl		= atalk_ioctl,
+	.gettstamp	= sock_gettstamp,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= atalk_compat_ioctl,
 #endif
@@ -1920,6 +1915,7 @@ static int __init atalk_init(void)
 	ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv);
 	if (!ddp_dl) {
 		pr_crit("Unable to register DDP with SNAP.\n");
+		rc = -ENOMEM;
 		goto out_sock;
 	}
diff --git a/net/atm/clip.c b/net/atm/clip.c
index d795b9c5aea4..b9e67e589a7b 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -345,8 +345,8 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
 		return NETDEV_TX_OK;
 	}
 	rt = (struct rtable *) dst;
-	if (rt->rt_gateway)
-		daddr = &rt->rt_gateway;
+	if (rt->rt_gw_family == AF_INET)
+		daddr = &rt->rt_gw4;
 	else
 		daddr = &ip_hdr(skb)->daddr;
 	n = dst_neigh_lookup(dst, daddr);
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 2ff0e5e470e3..d955b683aa7c 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -81,22 +81,6 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
 			       (int __user *)argp) ? -EFAULT : 0;
 		goto done;
 	}
-	case SIOCGSTAMP: /* borrowed from IP */
-#ifdef CONFIG_COMPAT
-		if (compat)
-			error = compat_sock_get_timestamp(sk, argp);
-		else
-#endif
-			error = sock_get_timestamp(sk, argp);
-		goto done;
-	case SIOCGSTAMPNS: /* borrowed from IP */
-#ifdef CONFIG_COMPAT
-		if (compat)
-			error = compat_sock_get_timestampns(sk, argp);
-		else
-#endif
-			error = sock_get_timestampns(sk, argp);
-		goto done;
 	case ATM_SETSC:
 		net_warn_ratelimited("ATM_SETSC is obsolete; used by %s:%d\n",
 				     current->comm, task_pid_nr(current));
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d7f5cf5b7594..a0311493b01b 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -710,7 +710,10 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
 
 static int lec_mcast_attach(struct atm_vcc *vcc, int arg)
 {
-	if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg])
+	if (arg < 0 || arg >= MAX_LEC_ITF)
+		return -EINVAL;
+	arg = array_index_nospec(arg, MAX_LEC_ITF);
+	if (!dev_lec[arg])
 		return -EINVAL;
 	vcc->proto_data = dev_lec[arg];
 	return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc);
@@ -723,11 +726,10 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
 	struct lec_priv *priv;
 
 	if (arg < 0)
-		i = 0;
-	else
-		i = arg;
+		arg = 0;
 	if (arg >= MAX_LEC_ITF)
 		return -EINVAL;
+	i = array_index_nospec(arg, MAX_LEC_ITF);
 	if (!dev_lec[i]) {
 		int size;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2cb10af16afc..02bd2a436bdf 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -118,6 +118,7 @@ static const struct proto_ops pvc_proto_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= vcc_compat_ioctl,
 #endif
+	.gettstamp	= sock_gettstamp,
 	.listen		= sock_no_listen,
 	.shutdown	= pvc_shutdown,
 	.setsockopt	= pvc_setsockopt,
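The lec.c hunks above follow the usual Spectre-v1 pattern: bounds-check first, then clamp the index with array_index_nospec() before the dependent load, so a mispredicted branch cannot speculatively read out of bounds. The shape of the fix, with a hypothetical table:

#include <linux/nospec.h>

#define MYTBL_SIZE 48			/* hypothetical table size */
static void *mytbl[MYTBL_SIZE];

static void *mytbl_lookup(int idx)
{
	if (idx < 0 || idx >= MYTBL_SIZE)
		return NULL;
	/* Clamp idx under speculation before using it as an array index. */
	idx = array_index_nospec(idx, MYTBL_SIZE);
	return mytbl[idx];
}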
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 2f91b766ac42..908cbb8654f5 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -641,6 +641,7 @@ static const struct proto_ops svc_proto_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= svc_compat_ioctl,
 #endif
+	.gettstamp	= sock_gettstamp,
 	.listen		= svc_listen,
 	.shutdown	= svc_shutdown,
 	.setsockopt	= svc_setsockopt,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 5d01edf8d819..012c0b6fc4f6 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1714,14 +1714,6 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		break;
 	}
 
-	case SIOCGSTAMP:
-		res = sock_get_timestamp(sk, argp);
-		break;
-
-	case SIOCGSTAMPNS:
-		res = sock_get_timestampns(sk, argp);
-		break;
-
 	case SIOCAX25ADDUID:	/* Add a uid to the uid/call map table */
 	case SIOCAX25DELUID:	/* Delete a uid from the uid/call map table */
 	case SIOCAX25GETUID: {
@@ -1888,8 +1880,8 @@ static int ax25_info_show(struct seq_file *seq, void *v)
 	 * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode
 	 */
 
-	seq_printf(seq, "%8.8lx %s %s%s ",
-		   (long) ax25,
+	seq_printf(seq, "%p %s %s%s ",
+		   ax25,
 		   ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
 		   ax2asc(buf, &ax25->source_addr),
 		   ax25->iamdigi? "*":"");
@@ -1950,6 +1942,7 @@ static const struct proto_ops ax25_proto_ops = {
 	.getname	= ax25_getname,
 	.poll		= datagram_poll,
 	.ioctl		= ax25_ioctl,
+	.gettstamp	= sock_gettstamp,
 	.listen		= ax25_listen,
 	.shutdown	= ax25_shutdown,
 	.setsockopt	= ax25_setsockopt,
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index a31db5e9ac8e..a3d188dfbe75 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -2,18 +2,6 @@
 # Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of version 2 of the GNU General Public
-# License as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
 #
 # B.A.T.M.A.N meshing protocol
 
@@ -109,6 +97,18 @@ config BATMAN_ADV_DEBUG
 	  buffer. The output is controlled via the batadv netdev specific
 	  log_level setting.
 
+config BATMAN_ADV_SYSFS
+	bool "batman-adv sysfs entries"
+	depends on BATMAN_ADV
+	default y
+	help
+	  Say Y here if you want to enable batman-adv device configuration and
+	  status interface through sysfs attributes. It is replaced by the
+	  batadv generic netlink family but still used by various userspace
+	  tools and scripts.
+
+	  If unsure, say Y.
+
 config BATMAN_ADV_TRACING
 	bool "B.A.T.M.A.N. tracing support"
 	depends on BATMAN_ADV
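The per-protocol SIOCGSTAMP/SIOCGSTAMPNS cases disappear across appletalk, atm and ax25 because the socket layer now dispatches those ioctls through a proto_ops->gettstamp hook; protocols simply point it at the common sock_gettstamp() helper, as the hunks above do. A sketch of what a protocol's ops table gains (myproto_* names are hypothetical, other members elided):

/* Sketch only: wire up the common helper instead of open-coding
 * SIOCGSTAMP/SIOCGSTAMPNS in the protocol's ioctl handler.
 */
static int myproto_ioctl(struct socket *sock, unsigned int cmd,
			 unsigned long arg);

static const struct proto_ops myproto_ops = {
	.ioctl		= myproto_ioctl,	/* no SIOCGSTAMP* cases needed */
	.gettstamp	= sock_gettstamp,	/* common timestamp handler */
	/* ... remaining members ... */
};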
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index a887ecc3efa1..fd63e116d9ff 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -2,19 +2,6 @@
 # Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of version 2 of the GNU General Public
-# License as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
-#
 
 obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
 batman-adv-y += bat_algo.o
@@ -41,7 +28,7 @@ batman-adv-y += originator.o
 batman-adv-y += routing.o
 batman-adv-y += send.o
 batman-adv-y += soft-interface.o
-batman-adv-y += sysfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_SYSFS) += sysfs.o
 batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o
 batman-adv-y += tp_meter.o
 batman-adv-y += translation-table.o
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 7b7e15641fef..fa39eaaab9d7 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "main.h"
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 25e7bb51928c..cb7d57d16c9d 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index de61091af666..bd4138ddf7e0 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "bat_iv_ogm.h"
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 785f6666273c..c7a9ba305bfc 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 445594ed58af..231b4aab4d8d 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "bat_v.h"
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 465a4fc23354..37833db098e6 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BAT_V_H_
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index a9b7919c9de5..2614a9caee00 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "bat_v_elp.h"
@@ -104,8 +92,10 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
 
 		ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
 
-		/* free the TID stats immediately */
-		cfg80211_sinfo_release_content(&sinfo);
+		if (!ret) {
+			/* free the TID stats immediately */
+			cfg80211_sinfo_release_content(&sinfo);
+		}
 
 		dev_put(real_netdev);
 		if (ret == -ENOENT) {
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 75f189ee4a1c..bb3d40f73bfe 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index c9698ad41854..fad95ef64e01 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "bat_v_ogm.h"
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index f67cf7ee06b2..616bf2ea8755 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
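The bat_v_elp.c hunk above fixes a use of unfilled data: cfg80211_get_station() only populates sinfo (including its allocated per-TID stats) on success, so releasing the content after a failed call operated on garbage. A sketch of the corrected call pattern (function name hypothetical):

#include <net/cfg80211.h>

static u32 wifi_neigh_throughput(struct net_device *dev, const u8 *addr)
{
	struct station_info sinfo;
	int ret;

	ret = cfg80211_get_station(dev, addr, &sinfo);
	if (ret)
		return 0;		/* sinfo was never filled in */

	/* Scalar fields stay valid; only the allocated content is freed. */
	cfg80211_sinfo_release_content(&sinfo);
	return sinfo.expected_throughput;
}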
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 63e134e763e3..7f04a6acf14e 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "bitarray.h"
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index f3a05ad9afad..84ad2d2b6ac9 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BITARRAY_H_
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index ef39aabdb694..663a53b6d36e 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "bridge_loop_avoidance.h"
@@ -59,7 +47,6 @@
 #include "netlink.h"
 #include "originator.h"
 #include "soft-interface.h"
-#include "sysfs.h"
 #include "translation-table.h"
 
 static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
@@ -803,6 +790,8 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac,
 				 const unsigned short vid)
 {
 	struct batadv_bla_claim search_claim, *claim;
+	struct batadv_bla_claim *claim_removed_entry;
+	struct hlist_node *claim_removed_node;
 
 	ether_addr_copy(search_claim.addr, mac);
 	search_claim.vid = vid;
@@ -813,10 +802,18 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
 	batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__,
 		   mac, batadv_print_vid(vid));
 
-	batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim,
-			   batadv_choose_claim, claim);
-	batadv_claim_put(claim); /* reference from the hash is gone */
+	claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash,
+						batadv_compare_claim,
+						batadv_choose_claim, claim);
+	if (!claim_removed_node)
+		goto free_claim;
+
+	/* reference from the hash is gone */
+	claim_removed_entry = hlist_entry(claim_removed_node,
+					  struct batadv_bla_claim, hash_entry);
+	batadv_claim_put(claim_removed_entry);
 
+free_claim:
 	/* don't need the reference from hash_find() anymore */
 	batadv_claim_put(claim);
 }
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 31771c751efb..012d72c8d064 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_BLA_H_
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 3b9d1ad2f467..d38d70ccdd5a 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "debugfs.h"
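The bridge_loop_avoidance.c hunk above stops dropping a reference for an entry that may never have been unlinked: batadv_hash_remove() returns the removed hlist_node (or NULL), and hlist_entry() converts that node back into the containing object so exactly the unlinked entry loses its hash reference. The generic container-of pattern, sketched with a hypothetical claim type:

#include <linux/list.h>

struct claim {
	struct hlist_node hash_entry;
	/* ... payload ... */
};

/* Stub standing in for the refcount drop (batadv_claim_put()). */
static void put_claim(struct claim *c) { }

static void release_removed(struct hlist_node *removed)
{
	struct claim *entry;

	if (!removed)
		return;		/* nothing was unlinked, nothing to put */

	entry = hlist_entry(removed, struct claim, hash_entry);
	put_claim(entry);	/* drop only the reference the hash held */
}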
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index c0b8694041ec..7fac680cf740 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_DEBUGFS_H_
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 310a4f353008..b0af3a11d406 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "distributed-arp-table.h"
@@ -667,7 +655,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst,
 }
 
 /**
- * batadv_dat_send_data() - send a payload to the selected candidates
+ * batadv_dat_forward_data() - copy and send payload to the selected candidates
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: payload to send
  * @ip: the DHT key
@@ -680,9 +668,9 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst,
  * Return: true if the packet is sent to at least one candidate, false
  * otherwise.
  */
-static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
-				 struct sk_buff *skb, __be32 ip,
-				 unsigned short vid, int packet_subtype)
+static bool batadv_dat_forward_data(struct batadv_priv *bat_priv,
+				    struct sk_buff *skb, __be32 ip,
+				    unsigned short vid, int packet_subtype)
 {
 	int i;
 	bool ret = false;
@@ -1277,8 +1265,8 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 		ret = true;
 	} else {
 		/* Send the request to the DHT */
-		ret = batadv_dat_send_data(bat_priv, skb, ip_dst, vid,
-					   BATADV_P_DAT_DHT_GET);
+		ret = batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+					      BATADV_P_DAT_DHT_GET);
 	}
 out:
 	if (dat_entry)
@@ -1392,8 +1380,10 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
 	/* Send the ARP reply to the candidates for both the IP addresses that
 	 * the node obtained from the ARP reply
 	 */
-	batadv_dat_send_data(bat_priv, skb, ip_src, vid, BATADV_P_DAT_DHT_PUT);
-	batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT);
+	batadv_dat_forward_data(bat_priv, skb, ip_src, vid,
+				BATADV_P_DAT_DHT_PUT);
+	batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+				BATADV_P_DAT_DHT_PUT);
 }
 
 /**
@@ -1444,7 +1434,6 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
 			   hw_src, &ip_src, hw_dst, &ip_dst,
 			   dat_entry->mac_addr, &dat_entry->ip);
 		dropped = true;
-		goto out;
 	}
 
 	/* Update our internal cache with both the IP addresses the node got
@@ -1453,6 +1442,9 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
 	batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
 	batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
 
+	if (dropped)
+		goto out;
+
 	/* If BLA is enabled, only forward ARP replies if we have claimed the
 	 * source of the ARP reply or if no one else of the same backbone has
 	 * already claimed that client. This prevents that different gateways
@@ -1708,8 +1700,10 @@ static void batadv_dat_put_dhcp(struct batadv_priv *bat_priv, u8 *chaddr,
 	batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid);
 	batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
 
-	batadv_dat_send_data(bat_priv, skb, yiaddr, vid, BATADV_P_DAT_DHT_PUT);
-	batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT);
+	batadv_dat_forward_data(bat_priv, skb, yiaddr, vid,
+				BATADV_P_DAT_DHT_PUT);
+	batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+				BATADV_P_DAT_DHT_PUT);
 
 	consume_skb(skb);
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 68c0ff321acd..110c27447d70 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index b506d15b8230..385fccdcf69d 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "fragmentation.h"
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index abdac26579bf..d6074ba2ada7 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index f5811f61aa92..47df4c678988 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "gateway_client.h"
@@ -53,7 +41,6 @@
 #include "originator.h"
 #include "routing.h"
 #include "soft-interface.h"
-#include "sysfs.h"
 #include "translation-table.h"
 
 /* These are the offsets of the "hw type" and "hw address length" in the dhcp
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index b5732c8be81a..0e14026feebd 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index e064de45e22c..dac097f9be03 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "gateway_common.h"
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 128467a0fb89..5cf50736c635 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 96ef7c70b4d9..79d1731b8306 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "hard-interface.h"
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 48de28c83401..c8ef6aa0e865 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 56a08ce193d5..a9d4e176f4de 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "hash.h"
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 37507b6d4006..ceef171f7f98 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_HASH_H_
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 9859ababb82e..0a91c8661357 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "icmp_socket.h"
@@ -77,7 +65,7 @@ static int batadv_socket_open(struct inode *inode, struct file *file)
 
 	batadv_debugfs_deprecated(file, "");
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 
 	socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
 	if (!socket_client) {
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 5f8926522ff0..35eecbfd2e65 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 3e610df8debf..f79ebd5b46e9 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "log.h"
@@ -102,7 +90,7 @@ static int batadv_log_open(struct inode *inode, struct file *file)
 	batadv_debugfs_deprecated(file,
 				  "Use tracepoint batadv:batadv_dbg instead\n");
 
-	nonseekable_open(inode, file);
+	stream_open(inode, file);
 	file->private_data = inode->i_private;
 	return 0;
 }
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 660e9bcc85a2..5504637e63d8 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_LOG_H_
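Both debugfs files above switch from nonseekable_open() to stream_open(), which additionally marks the file FMODE_STREAM so reads and writes no longer serialize on f_pos. The open-handler idiom, sketched for a hypothetical log file:

#include <linux/fs.h>

/* Sketch of a stream-like debugfs file: no seeking, no f_pos use. */
static int mylog_open(struct inode *inode, struct file *file)
{
	file->private_data = inode->i_private;
	return stream_open(inode, file);	/* returns 0, marks stream */
}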
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 75750870cf04..4a89177def64 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #include "main.h"
@@ -22,6 +10,7 @@
 #include <linux/build_bug.h>
 #include <linux/byteorder/generic.h>
 #include <linux/crc32c.h>
+#include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/genetlink.h>
 #include <linux/gfp.h>
@@ -31,6 +20,7 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/kernel.h>
+#include <linux/kobject.h>
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -40,6 +30,7 @@
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 #include <linux/skbuff.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
@@ -84,6 +75,22 @@ struct workqueue_struct *batadv_event_workqueue;
 
 static void batadv_recv_handler_init(void);
 
+#define BATADV_UEV_TYPE_VAR	"BATTYPE="
+#define BATADV_UEV_ACTION_VAR	"BATACTION="
+#define BATADV_UEV_DATA_VAR	"BATDATA="
+
+static char *batadv_uev_action_str[] = {
+	"add",
+	"del",
+	"change",
+	"loopdetect",
+};
+
+static char *batadv_uev_type_str[] = {
+	"gw",
+	"bla",
+};
+
 static int __init batadv_init(void)
 {
 	int ret;
@@ -161,6 +168,7 @@ int batadv_mesh_init(struct net_device *soft_iface)
 	spin_lock_init(&bat_priv->tt.commit_lock);
 	spin_lock_init(&bat_priv->gw.list_lock);
 #ifdef CONFIG_BATMAN_ADV_MCAST
+	spin_lock_init(&bat_priv->mcast.mla_lock);
 	spin_lock_init(&bat_priv->mcast.want_lists_lock);
 #endif
 	spin_lock_init(&bat_priv->tvlv.container_list_lock);
@@ -678,6 +686,60 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
 	return ap_isolation_enabled;
 }
 
+/**
+ * batadv_throw_uevent() - Send an uevent with batman-adv specific env data
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: subsystem type of event. Stored in uevent's BATTYPE
+ * @action: action type of event. Stored in uevent's BATACTION
+ * @data: string with additional information to the event (ignored for
+ *  BATADV_UEV_DEL). Stored in uevent's BATDATA
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+			enum batadv_uev_action action, const char *data)
+{
+	int ret = -ENOMEM;
+	struct kobject *bat_kobj;
+	char *uevent_env[4] = { NULL, NULL, NULL, NULL };
+
+	bat_kobj = &bat_priv->soft_iface->dev.kobj;
+
+	uevent_env[0] = kasprintf(GFP_ATOMIC,
+				  "%s%s", BATADV_UEV_TYPE_VAR,
+				  batadv_uev_type_str[type]);
+	if (!uevent_env[0])
+		goto out;
+
+	uevent_env[1] = kasprintf(GFP_ATOMIC,
+				  "%s%s", BATADV_UEV_ACTION_VAR,
+				  batadv_uev_action_str[action]);
+	if (!uevent_env[1])
+		goto out;
+
+	/* If the event is DEL, ignore the data field */
+	if (action != BATADV_UEV_DEL) {
+		uevent_env[2] = kasprintf(GFP_ATOMIC,
+					  "%s%s", BATADV_UEV_DATA_VAR, data);
+		if (!uevent_env[2])
+			goto out;
+	}
+
+	ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env);
+out:
+	kfree(uevent_env[0]);
+	kfree(uevent_env[1]);
+	kfree(uevent_env[2]);
+
+	if (ret)
+		batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+			   "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n",
+			   batadv_uev_type_str[type],
+			   batadv_uev_action_str[action],
+			   (action == BATADV_UEV_DEL ? "NULL" : data), ret);
+
+	return ret;
+}
+
 module_init(batadv_init);
 module_exit(batadv_exit);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 3ed669d7dc6b..c59afcba31e0 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _NET_BATMAN_ADV_MAIN_H_
@@ -25,7 +13,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2019.1"
+#define BATADV_SOURCE_VERSION "2019.2"
 #endif
 
 /* B.A.T.M.A.N. parameters */
@@ -394,5 +382,7 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
 unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len);
 bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid);
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+			enum batadv_uev_action action, const char *data);
 
 #endif /* _NET_BATMAN_ADV_MAIN_H_ */
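batadv_throw_uevent() above is, at its core, a thin wrapper around kobject_uevent_env(): build a NULL-terminated array of KEY=value strings and fire a KOBJ_CHANGE event on the device's kobject. Reduced to that core (error reporting trimmed, names hypothetical):

#include <linux/device.h>
#include <linux/kobject.h>
#include <linux/slab.h>

/* Sketch: emit "MYKEY=value" as a change uevent on dev's kobject. */
static int emit_change_event(struct device *dev, const char *value)
{
	char *envp[2] = { NULL, NULL };	/* must stay NULL-terminated */
	int ret;

	envp[0] = kasprintf(GFP_KERNEL, "MYKEY=%s", value);
	if (!envp[0])
		return -ENOMEM;

	ret = kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp);
	kfree(envp[0]);
	return ret;
}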
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "multicast.h" @@ -66,6 +54,7 @@ #include "hash.h" #include "log.h" #include "netlink.h" +#include "send.h" #include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -325,8 +314,6 @@ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. - * - * Do not call outside of the mcast worker! (or cancel mcast worker first) */ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) @@ -334,8 +321,6 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; - WARN_ON(delayed_work_pending(&bat_priv->mcast.work)); - hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && @@ -359,8 +344,6 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. - * - * Do not call outside of the mcast worker! (or cancel mcast worker first) */ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) @@ -368,8 +351,6 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; - WARN_ON(delayed_work_pending(&bat_priv->mcast.work)); - if (!mcast_list) return; @@ -658,7 +639,10 @@ static void batadv_mcast_mla_update(struct work_struct *work) priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); + spin_lock(&bat_priv->mcast.mla_lock); __batadv_mcast_mla_update(bat_priv); + spin_unlock(&bat_priv->mcast.mla_lock); + batadv_mcast_start_timer(bat_priv); } @@ -991,6 +975,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; + unsigned int mcast_fanout; struct ethhdr *ethhdr; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); @@ -1025,8 +1010,203 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, case 0: return BATADV_FORW_NONE; default: - return BATADV_FORW_ALL; + mcast_fanout = atomic_read(&bat_priv->multicast_fanout); + + if (!unsnoop_count && total_count <= mcast_fanout) + return BATADV_FORW_SOME; + } + + return BATADV_FORW_ALL; +} + +/** + * batadv_mcast_forw_tt() - forwards a packet to multicast listeners + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any multicast + * listener registered in the translation table. A transmission is performed + * via a batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. 
+ */ +static int +batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) +{ + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + struct batadv_tt_orig_list_entry *orig_entry; + + struct batadv_tt_global_entry *tt_global; + const u8 *addr = eth_hdr(skb)->h_dest; + + tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); + if (!tt_global) + goto out; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_entry->orig_node, vid); + } + rcu_read_unlock(); + + batadv_tt_global_entry_put(tt_global); + +out: + return ret; +} + +/** + * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_ipv4_list, + mcast_want_all_ipv4_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 + * @bat_priv: the bat priv with all the soft interface information + * @skb: The multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a + * batman-adv unicast packet for each such destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS + * otherwise. + */ +static int +batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + struct batadv_orig_node *orig_node; + int ret = NET_XMIT_SUCCESS; + struct sk_buff *newskb; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, + &bat_priv->mcast.want_all_ipv6_list, + mcast_want_all_ipv6_node) { + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + ret = NET_XMIT_DROP; + break; + } + + batadv_send_skb_unicast(bat_priv, newskb, BATADV_UNICAST, 0, + orig_node, vid); + } + rcu_read_unlock(); + return ret; +} + +/** + * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node with a + * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A + * transmission is performed via a batman-adv unicast packet for each such + * destination node. + * + * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family + * is neither IPv4 nor IPv6. 
NET_XMIT_SUCCESS otherwise. + */ +static int +batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) +{ + switch (ntohs(eth_hdr(skb)->h_proto)) { + case ETH_P_IP: + return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); + case ETH_P_IPV6: + return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); + default: + /* we shouldn't be here... */ + return NET_XMIT_DROP; + } +} + +/** + * batadv_mcast_forw_send() - send packet to any detected multicast recipient + * @bat_priv: the bat priv with all the soft interface information + * @skb: the multicast packet to transmit + * @vid: the vlan identifier + * + * Sends copies of a frame with multicast destination to any node that signaled + * interest in it, either via the translation table or via the corresponding + * want-all flags. A transmission is performed via a batman-adv unicast packet + * for each such destination node. + * + * The given skb is consumed/freed. + * + * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family + * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. + */ +int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) +{ + int ret; + + ret = batadv_mcast_forw_tt(bat_priv, skb, vid); + if (ret != NET_XMIT_SUCCESS) { + kfree_skb(skb); + return ret; + } + + ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); + if (ret != NET_XMIT_SUCCESS) { + kfree_skb(skb); + return ret; + } + + consume_skb(skb); + return ret; } /** diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 466013fe88af..653b9b76fabe 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -2,18 +2,6 @@ /* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
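[Annotation] Since batadv_mcast_forw_send() above consumes the skb on every path (kfree_skb() on error, consume_skb() on success), its caller must not touch the buffer afterwards. A minimal sketch of the intended call pattern, mirroring the soft-interface.c hunk later in this patch:

	/* after batadv_mcast_forw_mode() returned BATADV_FORW_SOME */
	ret = batadv_mcast_forw_send(bat_priv, skb, vid);
	/* skb is gone here regardless of ret */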
*/ #ifndef _NET_BATMAN_ADV_MULTICAST_H_ @@ -36,6 +24,13 @@ enum batadv_forw_mode { BATADV_FORW_ALL, /** + * @BATADV_FORW_SOME: forward the packet to some nodes (currently via + * a multicast-to-unicast conversion and the BATMAN unicast routing + * protocol) + */ + BATADV_FORW_SOME, + + /** * @BATADV_FORW_SINGLE: forward the packet to a single node (currently * via the BATMAN unicast routing protocol) */ @@ -51,6 +46,9 @@ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node **mcast_single_orig); +int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid); + void batadv_mcast_init(struct batadv_priv *bat_priv); int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); @@ -73,6 +71,14 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, return BATADV_FORW_ALL; } +static inline int +batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} + static inline int batadv_mcast_init(struct batadv_priv *bat_priv) { return 0; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 67a58da2e6a0..a67720fad46c 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -2,18 +2,6 @@ /* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
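[Annotation] The BATADV_ATTR_MULTICAST_FANOUT attribute wired up in the netlink.c hunks below makes the new fanout limit configurable per mesh interface. A hedged userspace sketch using libnl-genl (error handling omitted; the batadv_set_fanout() wrapper is the annotator's, only the BATADV_* identifiers come from the UAPI this patch extends):

	#include <stdint.h>
	#include <net/if.h>
	#include <netlink/genl/ctrl.h>
	#include <netlink/genl/genl.h>
	#include <linux/batman_adv.h>

	static int batadv_set_fanout(const char *mesh_iface, uint32_t fanout)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg = nlmsg_alloc();
		int family;

		genl_connect(sk);
		family = genl_ctrl_resolve(sk, BATADV_NL_NAME);

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			    BATADV_CMD_SET_MESH, 1);
		nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
			    if_nametoindex(mesh_iface));
		nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT, fanout);

		return nl_send_auto(sk, msg);
	}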
*/ #include "netlink.h" @@ -157,6 +145,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, @@ -353,6 +342,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, !atomic_read(&bat_priv->multicast_mode))) goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT, + atomic_read(&bat_priv->multicast_fanout))) + goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC @@ -592,6 +585,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr)); } + + if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) { + attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT]; + + atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr)); + } #endif /* CONFIG_BATMAN_ADV_MCAST */ #ifdef CONFIG_BATMAN_ADV_NC @@ -1344,35 +1343,35 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ - .policy = batadv_netlink_policy, .doit = batadv_netlink_get_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .doit = batadv_netlink_tp_meter_start, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER_CANCEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .doit = batadv_netlink_tp_meter_cancel, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_GET_ROUTING_ALGOS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_algo_dump, }, { .cmd = BATADV_CMD_GET_HARDIF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ - .policy = batadv_netlink_policy, .dumpit = batadv_netlink_dump_hardif, .doit = batadv_netlink_get_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | @@ -1380,85 +1379,85 @@ static const struct genl_ops batadv_netlink_ops[] = { }, { .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_tt_local_dump, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_tt_global_dump, }, { .cmd = BATADV_CMD_GET_ORIGINATORS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_orig_dump, }, { .cmd = BATADV_CMD_GET_NEIGHBORS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = 
batadv_hardif_neigh_dump, }, { .cmd = BATADV_CMD_GET_GATEWAYS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_gw_dump, }, { .cmd = BATADV_CMD_GET_BLA_CLAIM, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_bla_claim_dump, }, { .cmd = BATADV_CMD_GET_BLA_BACKBONE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_bla_backbone_dump, }, { .cmd = BATADV_CMD_GET_DAT_CACHE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_dat_cache_dump, }, { .cmd = BATADV_CMD_GET_MCAST_FLAGS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .dumpit = batadv_mcast_flags_dump, }, { .cmd = BATADV_CMD_SET_MESH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .doit = batadv_netlink_set_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_SET_HARDIF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .doit = batadv_netlink_set_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_VLAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ - .policy = batadv_netlink_policy, .doit = batadv_netlink_get_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, }, { .cmd = BATADV_CMD_SET_VLAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = batadv_netlink_policy, .doit = batadv_netlink_set_vlan, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_VLAN, @@ -1470,6 +1469,7 @@ struct genl_family batadv_netlink_family __ro_after_init = { .name = BATADV_NL_NAME, .version = 1, .maxattr = BATADV_ATTR_MAX, + .policy = batadv_netlink_policy, .netnsok = true, .pre_doit = batadv_pre_doit, .post_doit = batadv_post_doit, diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 7273368544fc..d1e0681b8743 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -2,18 +2,6 @@ /* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_NETLINK_H_ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 278762bd94c6..c5e7906045f3 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. 
contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "network-coding.h" diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 96ef0a511fc7..74f56113a5d0 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index e5cdf89ef63c..45db798a7297 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -2,18 +2,6 @@ /* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "originator.h" diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index dca1e4a34ec6..3829e26f9c5d 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index cae0e5dd0768..f0f864820dea 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "routing.h" diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 0102d69d345c..b96c6d06d188 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_ROUTING_H_ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 66a8b3e44501..3ce5f7bad369 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "send.h" diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 1f6132922e60..5921ee4e107c 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_SEND_H_ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 2e367230376b..a7677e1d000f 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "soft-interface.h" @@ -209,7 +197,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, unsigned short vid; u32 seqno; int gw_mode; - enum batadv_forw_mode forw_mode; + enum batadv_forw_mode forw_mode = BATADV_FORW_SINGLE; struct batadv_orig_node *mcast_single_orig = NULL; int network_offset = ETH_HLEN; __be16 proto; @@ -317,7 +305,8 @@ send: if (forw_mode == BATADV_FORW_NONE) goto dropped; - if (forw_mode == BATADV_FORW_SINGLE) + if (forw_mode == BATADV_FORW_SINGLE || + forw_mode == BATADV_FORW_SOME) do_bcast = false; } } @@ -377,6 +366,8 @@ send: ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, mcast_single_orig, vid); + } else if (forw_mode == BATADV_FORW_SOME) { + ret = batadv_mcast_forw_send(bat_priv, skb, vid); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) @@ -818,6 +809,7 @@ static int batadv_softif_init_late(struct net_device *dev) bat_priv->mcast.querier_ipv6.shadowing = false; bat_priv->mcast.flags = BATADV_NO_FLAGS; atomic_set(&bat_priv->multicast_mode, 1); + atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 538bb661878c..275442a7acb6 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 0b4b3fb778a6..80fc3253c336 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -2,23 +2,12 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. 
contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "sysfs.h" #include "main.h" +#include <asm/current.h> #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/device.h> @@ -34,6 +23,7 @@ #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/stddef.h> #include <linux/string.h> @@ -52,6 +42,16 @@ #include "network-coding.h" #include "soft-interface.h" +/** + * batadv_sysfs_deprecated() - Log use of deprecated batadv sysfs access + * @attr: attribute which was accessed + */ +static void batadv_sysfs_deprecated(struct attribute *attr) +{ + pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of sysfs file \"%s\".\nUse batadv genl family instead", + current->comm, task_pid_nr(current), attr->name); +} + static struct net_device *batadv_kobj_to_netdev(struct kobject *obj) { struct device *dev = container_of(obj->parent, struct device, kobj); @@ -114,22 +114,6 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) return vlan; } -#define BATADV_UEV_TYPE_VAR "BATTYPE=" -#define BATADV_UEV_ACTION_VAR "BATACTION=" -#define BATADV_UEV_DATA_VAR "BATDATA=" - -static char *batadv_uev_action_str[] = { - "add", - "del", - "change", - "loopdetect", -}; - -static char *batadv_uev_type_str[] = { - "gw", - "bla", -}; - /* Use this, if you have customized show and store functions for vlan attrs */ #define BATADV_ATTR_VLAN(_name, _mode, _show, _store) \ struct batadv_attribute batadv_attr_vlan_##_name = { \ @@ -157,6 +141,7 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ struct batadv_priv *bat_priv = netdev_priv(net_dev); \ ssize_t length; \ \ + batadv_sysfs_deprecated(attr); \ length = __batadv_store_bool_attr(buff, count, _post_func, attr,\ &bat_priv->_name, net_dev); \ \ @@ -171,6 +156,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ \ + batadv_sysfs_deprecated(attr); \ return sprintf(buff, "%s\n", \ atomic_read(&bat_priv->_name) == 0 ? \ "disabled" : "enabled"); \ @@ -194,6 +180,7 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ struct batadv_priv *bat_priv = netdev_priv(net_dev); \ ssize_t length; \ \ + batadv_sysfs_deprecated(attr); \ length = __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ &bat_priv->_var, net_dev, \ @@ -210,6 +197,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ { \ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \ \ + batadv_sysfs_deprecated(attr); \ return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \ } \ @@ -234,6 +222,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \ attr, &vlan->_name, \ bat_priv->soft_iface); \ \ + batadv_sysfs_deprecated(attr); \ if (vlan->vid) \ batadv_netlink_notify_vlan(bat_priv, vlan); \ else \ @@ -254,6 +243,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \ atomic_read(&vlan->_name) == 0 ? 
\ "disabled" : "enabled"); \ \ + batadv_sysfs_deprecated(attr); \ batadv_softif_vlan_put(vlan); \ return res; \ } @@ -275,6 +265,7 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ struct batadv_priv *bat_priv; \ ssize_t length; \ \ + batadv_sysfs_deprecated(attr); \ hard_iface = batadv_hardif_get_by_netdev(net_dev); \ if (!hard_iface) \ return 0; \ @@ -302,6 +293,7 @@ ssize_t batadv_show_##_name(struct kobject *kobj, \ struct batadv_hard_iface *hard_iface; \ ssize_t length; \ \ + batadv_sysfs_deprecated(attr); \ hard_iface = batadv_hardif_get_by_netdev(net_dev); \ if (!hard_iface) \ return 0; \ @@ -446,6 +438,7 @@ static ssize_t batadv_show_bat_algo(struct kobject *kobj, { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + batadv_sysfs_deprecated(attr); return sprintf(buff, "%s\n", bat_priv->algo_ops->name); } @@ -462,6 +455,8 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); int bytes_written; + batadv_sysfs_deprecated(attr); + /* GW mode is not available if the routing algorithm in use does not * implement the GW API */ @@ -496,6 +491,8 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, char *curr_gw_mode_str; int gw_mode_tmp = -1; + batadv_sysfs_deprecated(attr); + /* toggling GW mode is allowed only if the routing algorithm in use * provides the GW API */ @@ -570,6 +567,8 @@ static ssize_t batadv_show_gw_sel_class(struct kobject *kobj, { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + batadv_sysfs_deprecated(attr); + /* GW selection class is not available if the routing algorithm in use * does not implement the GW API */ @@ -590,6 +589,8 @@ static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); ssize_t length; + batadv_sysfs_deprecated(attr); + /* setting the GW selection class is allowed only if the routing * algorithm in use implements the GW API */ @@ -620,6 +621,8 @@ static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); u32 down, up; + batadv_sysfs_deprecated(attr); + down = atomic_read(&bat_priv->gw.bandwidth_down); up = atomic_read(&bat_priv->gw.bandwidth_up); @@ -635,6 +638,8 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj, struct net_device *net_dev = batadv_kobj_to_netdev(kobj); ssize_t length; + batadv_sysfs_deprecated(attr); + if (buff[count - 1] == '\n') buff[count - 1] = '\0'; @@ -659,6 +664,7 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj, { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + batadv_sysfs_deprecated(attr); return sprintf(buff, "%#.8x/%#.8x\n", bat_priv->isolation_mark, bat_priv->isolation_mark_mask); } @@ -682,6 +688,8 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, u32 mark, mask; char *mask_ptr; + batadv_sysfs_deprecated(attr); + /* parse the mask if it has been specified, otherwise assume the mask is * the biggest possible */ @@ -937,6 +945,8 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, ssize_t length; const char *ifname; + batadv_sysfs_deprecated(attr); + hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface) return 0; @@ -1041,6 +1051,8 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, struct net_device *net_dev = batadv_kobj_to_netdev(kobj); struct batadv_store_mesh_work *store_work; + batadv_sysfs_deprecated(attr); + if (buff[count - 1] == '\n') buff[count - 1] = '\0'; @@ 
-1072,6 +1084,8 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj, struct batadv_hard_iface *hard_iface; ssize_t length; + batadv_sysfs_deprecated(attr); + hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface) return 0; @@ -1116,13 +1130,15 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj, struct attribute *attr, char *buff, size_t count) { - struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); struct net_device *net_dev = batadv_kobj_to_netdev(kobj); struct batadv_hard_iface *hard_iface; + struct batadv_priv *bat_priv; u32 tp_override; u32 old_tp_override; bool ret; + batadv_sysfs_deprecated(attr); + hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface) return -EINVAL; @@ -1147,7 +1163,10 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj, atomic_set(&hard_iface->bat_v.throughput_override, tp_override); - batadv_netlink_notify_hardif(bat_priv, hard_iface); + if (hard_iface->soft_iface) { + bat_priv = netdev_priv(hard_iface->soft_iface); + batadv_netlink_notify_hardif(bat_priv, hard_iface); + } out: batadv_hardif_put(hard_iface); @@ -1162,6 +1181,8 @@ static ssize_t batadv_show_throughput_override(struct kobject *kobj, struct batadv_hard_iface *hard_iface; u32 tp_override; + batadv_sysfs_deprecated(attr); + hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface) return -EINVAL; @@ -1247,57 +1268,3 @@ void batadv_sysfs_del_hardif(struct kobject **hardif_obj) kobject_put(*hardif_obj); *hardif_obj = NULL; } - -/** - * batadv_throw_uevent() - Send an uevent with batman-adv specific env data - * @bat_priv: the bat priv with all the soft interface information - * @type: subsystem type of event. Stored in uevent's BATTYPE - * @action: action type of event. Stored in uevent's BATACTION - * @data: string with additional information to the event (ignored for - * BATADV_UEV_DEL). Stored in uevent's BATDATA - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, - enum batadv_uev_action action, const char *data) -{ - int ret = -ENOMEM; - struct kobject *bat_kobj; - char *uevent_env[4] = { NULL, NULL, NULL, NULL }; - - bat_kobj = &bat_priv->soft_iface->dev.kobj; - - uevent_env[0] = kasprintf(GFP_ATOMIC, - "%s%s", BATADV_UEV_TYPE_VAR, - batadv_uev_type_str[type]); - if (!uevent_env[0]) - goto out; - - uevent_env[1] = kasprintf(GFP_ATOMIC, - "%s%s", BATADV_UEV_ACTION_VAR, - batadv_uev_action_str[action]); - if (!uevent_env[1]) - goto out; - - /* If the event is DEL, ignore the data field */ - if (action != BATADV_UEV_DEL) { - uevent_env[2] = kasprintf(GFP_ATOMIC, - "%s%s", BATADV_UEV_DATA_VAR, data); - if (!uevent_env[2]) - goto out; - } - - ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); -out: - kfree(uevent_env[0]); - kfree(uevent_env[1]); - kfree(uevent_env[2]); - - if (ret) - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", - batadv_uev_type_str[type], - batadv_uev_action_str[action], - (action == BATADV_UEV_DEL ? "NULL" : data), ret); - return ret; -} diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 705ffbe763f4..83fa808b1871 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. 
contributors: * * Marek Lindner - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_SYSFS_H_ @@ -57,6 +45,8 @@ struct batadv_attribute { char *buf, size_t count); }; +#ifdef CONFIG_BATMAN_ADV_SYSFS + int batadv_sysfs_add_meshif(struct net_device *dev); void batadv_sysfs_del_meshif(struct net_device *dev); int batadv_sysfs_add_hardif(struct kobject **hardif_obj, @@ -66,7 +56,39 @@ int batadv_sysfs_add_vlan(struct net_device *dev, struct batadv_softif_vlan *vlan); void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan); -int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, - enum batadv_uev_action action, const char *data); + +#else + +static inline int batadv_sysfs_add_meshif(struct net_device *dev) +{ + return 0; +} + +static inline void batadv_sysfs_del_meshif(struct net_device *dev) +{ +} + +static inline int batadv_sysfs_add_hardif(struct kobject **hardif_obj, + struct net_device *dev) +{ + return 0; +} + +static inline void batadv_sysfs_del_hardif(struct kobject **hardif_obj) +{ +} + +static inline int batadv_sysfs_add_vlan(struct net_device *dev, + struct batadv_softif_vlan *vlan) +{ + return 0; +} + +static inline void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, + struct batadv_softif_vlan *vlan) +{ +} + +#endif #endif /* _NET_BATMAN_ADV_SYSFS_H_ */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 500109bbd551..820392146249 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "tp_meter.h" diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 6b4d0f733896..604b3799c972 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -2,18 +2,6 @@ /* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
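[Annotation] The #ifdef CONFIG_BATMAN_ADV_SYSFS block added to sysfs.h above lets the now-deprecated sysfs interface be compiled out entirely while callers build unchanged, because every entry point gains a no-op stub (a matching Kconfig option is presumably introduced elsewhere in this series). Call sites therefore need no guards of their own, e.g. (illustrative, not a hunk from this patch):

	ret = batadv_sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
	if (ret)
		goto free_if;	/* with sysfs compiled out this path is never taken */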
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_TP_METER_H_ diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index f77c917ed20d..3cedd2c36528 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Sven Eckelmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define CREATE_TRACE_POINTS diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 5e5579051400..d8f764521c0b 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -2,18 +2,6 @@ /* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Sven Eckelmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index f73d79139ae7..1ddfd5e011ee 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "translation-table.h" @@ -205,7 +193,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, * Return: a pointer to the corresponding tt_global_entry struct if the client * is found, NULL otherwise. 
*/ -static struct batadv_tt_global_entry * +struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { @@ -300,8 +288,7 @@ static void batadv_tt_global_entry_release(struct kref *ref) * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ -static void -batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) +void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); @@ -616,14 +603,26 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, struct batadv_tt_global_entry *tt_global, const char *message) { + struct batadv_tt_global_entry *tt_removed_entry; + struct hlist_node *tt_removed_node; + batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, batadv_print_vid(tt_global->common.vid), message); - batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, - batadv_choose_tt, &tt_global->common); - batadv_tt_global_entry_put(tt_global); + tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash, + batadv_compare_tt, + batadv_choose_tt, + &tt_global->common); + if (!tt_removed_node) + return; + + /* drop reference of removed hash entry */ + tt_removed_entry = hlist_entry(tt_removed_node, + struct batadv_tt_global_entry, + common.hash_entry); + batadv_tt_global_entry_put(tt_removed_entry); } /** @@ -1337,9 +1336,10 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming) { + struct batadv_tt_local_entry *tt_removed_entry; struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; - void *tt_entry_exists; + struct hlist_node *tt_removed_node; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); if (!tt_local_entry) @@ -1368,15 +1368,18 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, */ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL); - tt_entry_exists = batadv_hash_remove(bat_priv->tt.local_hash, + tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local_entry->common); - if (!tt_entry_exists) + if (!tt_removed_node) goto out; - /* extra call to free the local tt entry */ - batadv_tt_local_entry_put(tt_local_entry); + /* drop reference of removed hash entry */ + tt_removed_entry = hlist_entry(tt_removed_node, + struct batadv_tt_local_entry, + common.hash_entry); + batadv_tt_local_entry_put(tt_removed_entry); out: if (tt_local_entry) diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 61bca75e5911..c8c48d62a430 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
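[Annotation] The batadv_tt_global_free()/batadv_tt_local_remove() rework above ensures the hash table's reference is dropped exactly once: previously the entry found by the earlier lookup was put unconditionally, even when a concurrent deleter had already unlinked it, so racing removals could underflow the refcount. Now only the caller whose batadv_hash_remove() actually unlinked the node (it returns the removed hlist_node, NULL otherwise) drops that reference. The generic shape of the pattern, with "hash" and "entry" standing in for the global/local variants:

	tt_removed_node = batadv_hash_remove(hash, batadv_compare_tt,
					     batadv_choose_tt, &entry->common);
	if (!tt_removed_node)
		return;		/* lost the race; the winner drops the ref */

	tt_removed_entry = hlist_entry(tt_removed_node, typeof(*entry),
				       common.hash_entry);
	batadv_tt_global_entry_put(tt_removed_entry);	/* drop exactly once */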
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ @@ -41,6 +29,10 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); +struct batadv_tt_global_entry * +batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, + unsigned short vid); +void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 7e947b01919d..aae63f0d21eb 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "main.h" diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index c0f033b1acb8..114ac01e06af 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #ifndef _NET_BATMAN_ADV_TVLV_H_ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a21b34ed6548..74b644738a36 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _NET_BATMAN_ADV_TYPES_H_ @@ -1224,6 +1212,11 @@ struct batadv_priv_mcast { unsigned char bridged:1; /** + * @mla_lock: a lock protecting mla_list and mla_flags + */ + spinlock_t mla_lock; + + /** * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP * traffic */ @@ -1565,6 +1558,12 @@ struct batadv_priv { * node's sender/originating side */ atomic_t multicast_mode; + + /** + * @multicast_fanout: Maximum number of packet copies to generate for a + * multicast-to-unicast conversion + */ + atomic_t multicast_fanout; #endif /** @orig_interval: OGM broadcast interval in milliseconds */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 8d12198eaa94..94ddf19998c7 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -521,14 +521,6 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = put_user(amount, (int __user *) arg); break; - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *) arg); - break; - - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *) arg); - break; - default: err = -ENOIOCTLCMD; break; diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 78bec8df8525..aaa39409eeb7 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -161,7 +161,6 @@ static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output) } shash->tfm = tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_digest(shash, plaintext, psize, output); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index bd4978ce8c45..3cf0764d5793 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1276,6 +1276,14 @@ int hci_conn_check_link_mode(struct hci_conn *conn) !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; + /* The minimum encryption key size needs to be enforced by the + * host stack before establishing any L2CAP connections. The + * specification in theory allows a minimum of 1, but to align + * BR/EDR and LE transports, a minimum of 7 is chosen. + */ + if (conn->enc_key_size < HCI_MIN_ENC_KEY_SIZE) + return 0; + return 1; } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index d6b2540ba7f8..b81bf53c5ac4 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1460,8 +1460,6 @@ static int hci_dev_do_open(struct hci_dev *hdev) hdev->set_bdaddr) ret = hdev->set_bdaddr(hdev, &hdev->public_addr); - else - ret = -EADDRNOTAVAIL; } setup_failed: @@ -4383,6 +4381,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, return; } + /* If we reach this point this event matches the last command sent */ + hci_dev_clear_flag(hdev, HCI_CMD_PENDING); + /* If the command succeeded and there's still more commands in * this request the request is not yet complete. 
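[Annotation] Taken together, the hci_core.c, hci_event.c and hci_request.c hunks around here add a small handshake that detects controller events which do not match the last command sent. A condensed view of the flow (simplified; both halves appear verbatim in the surrounding hunks):

	/* sender (hci_cmd_work): mark a pending request before transmit */
	if (hci_req_status_pend(hdev))
		hci_dev_set_flag(hdev, HCI_CMD_PENDING);
	hci_send_frame(hdev, skb);

	/* receiver (command complete/status events): a matching event
	 * clears the flag inside hci_req_cmd_complete(); if it is still
	 * set afterwards, the event was for some other opcode, so do not
	 * kick the next queued command
	 */
	if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) {
		bt_dev_err(hdev, "unexpected event for opcode 0x%4.4x", *opcode);
		return;
	}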
*/ @@ -4493,6 +4494,8 @@ static void hci_cmd_work(struct work_struct *work) hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); if (hdev->sent_cmd) { + if (hci_req_status_pend(hdev)) + hci_dev_set_flag(hdev, HCI_CMD_PENDING); atomic_dec(&hdev->cmd_cnt); hci_send_frame(hdev, skb); if (test_bit(HCI_RESET, &hdev->flags)) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 609fd6871c5a..9e4fcf406d9c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3404,6 +3404,12 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_req_cmd_complete(hdev, *opcode, *status, req_complete, req_complete_skb); + if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) { + bt_dev_err(hdev, + "unexpected event for opcode 0x%4.4x", *opcode); + return; + } + if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q)) queue_work(hdev->workqueue, &hdev->cmd_work); } @@ -3511,6 +3517,12 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete, req_complete_skb); + if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) { + bt_dev_err(hdev, + "unexpected event for opcode 0x%4.4x", *opcode); + return; + } + if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q)) queue_work(hdev->workqueue, &hdev->cmd_work); } @@ -5433,7 +5445,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) ev->data, ev->length); } - ptr += sizeof(*ev) + ev->length + 1; + ptr += sizeof(*ev) + ev->length; } hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index ca73d36cc149..e9a95ed65491 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -46,6 +46,11 @@ void hci_req_purge(struct hci_request *req) skb_queue_purge(&req->cmd_q); } +bool hci_req_status_pend(struct hci_dev *hdev) +{ + return hdev->req_status == HCI_REQ_PEND; +} + static int req_run(struct hci_request *req, hci_req_complete_t complete, hci_req_complete_skb_t complete_skb) { diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 692cc8b13368..55b2050cc9ff 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -37,6 +37,7 @@ struct hci_request { void hci_req_init(struct hci_request *req, struct hci_dev *hdev); void hci_req_purge(struct hci_request *req); +bool hci_req_status_pend(struct hci_dev *hdev); int hci_req_run(struct hci_request *req, hci_req_complete_t complete); int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete); void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 9f85a1943be9..2151913892ce 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -75,6 +75,7 @@ static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user sockfd_put(csock); return err; } + ca.name[sizeof(ca.name)-1] = 0; err = hidp_connection_add(&ca, csock, isock); if (!err && copy_to_user(argp, &ca, sizeof(ca))) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index f17e393b43b4..b53acd6c9a3d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -510,12 +510,12 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) } EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults); -static void l2cap_le_flowctl_init(struct l2cap_chan *chan) +static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits) { chan->sdu = NULL; chan->sdu_last_frag = 
NULL; chan->sdu_len = 0; - chan->tx_credits = 0; + chan->tx_credits = tx_credits; /* Derive MPS from connection MTU to stop HCI fragmentation */ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE); /* Give enough credits for a full packet */ @@ -1281,7 +1281,7 @@ static void l2cap_le_connect(struct l2cap_chan *chan) if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags)) return; - l2cap_le_flowctl_init(chan); + l2cap_le_flowctl_init(chan, 0); req.psm = chan->psm; req.scid = cpu_to_le16(chan->scid); @@ -5532,11 +5532,10 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, chan->dcid = scid; chan->omtu = mtu; chan->remote_mps = mps; - chan->tx_credits = __le16_to_cpu(req->credits); __l2cap_chan_add(conn, chan); - l2cap_le_flowctl_init(chan); + l2cap_le_flowctl_init(chan, __le16_to_cpu(req->credits)); dcid = chan->scid; credits = chan->rx_credits; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a3a2cd55e23a..a7be8b59b3c2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -791,10 +791,13 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, conn = chan->conn; - /*change security for LE channels */ + /* change security for LE channels */ if (chan->scid == L2CAP_CID_ATT) { - if (smp_conn_security(conn->hcon, sec.level)) + if (smp_conn_security(conn->hcon, sec.level)) { + err = -EINVAL; break; + } + set_bit(FLAG_PENDING_SECURITY, &chan->flags); sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; @@ -1655,6 +1658,7 @@ static const struct proto_ops l2cap_sock_ops = { .recvmsg = l2cap_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, + .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = l2cap_sock_shutdown, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 2457f408d17d..150114e33b20 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2301,8 +2301,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + key_count * - sizeof(struct mgmt_link_key_info); + expected_len = struct_size(cp, keys, key_count); if (expected_len != len) { bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes", expected_len, len); @@ -5030,7 +5029,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info); + expected_len = struct_size(cp, irks, irk_count); if (expected_len != len) { bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes", expected_len, len); @@ -5112,8 +5111,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + key_count * - sizeof(struct mgmt_ltk_info); + expected_len = struct_size(cp, keys, key_count); if (expected_len != len) { bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes", expected_len, len); @@ -5847,8 +5845,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - expected_len = sizeof(*cp) + param_count * - sizeof(struct mgmt_conn_param); + expected_len = struct_size(cp, params, param_count); if (expected_len != len) { bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes", expected_len, len); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b1f49fcc0478..90bb53aa4bee 100644 --- 
a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1039,6 +1039,7 @@ static const struct proto_ops rfcomm_sock_ops = { .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, + .gettstamp = sock_gettstamp, .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9a580999ca57..b91d6b440fdf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -523,12 +523,12 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, struct sock *sk = sock->sk; int err = 0; - BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); - if (!addr || addr_len < sizeof(struct sockaddr_sco) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; + BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr); + lock_sock(sk); if (sk->sk_state != BT_OPEN) { @@ -1190,6 +1190,7 @@ static const struct proto_ops sco_sock_ops = { .recvmsg = sco_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, + .gettstamp = sock_gettstamp, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = sco_sock_shutdown, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 621146d04c03..e68c715f8d37 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -183,7 +183,6 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m, } desc->tfm = tfm; - desc->flags = 0; /* Swap key and message from LSB to MSB */ swap_buf(k, tmp, 16); diff --git a/net/bpf/Makefile b/net/bpf/Makefile index 27b2992a0692..b0ca361742e4 100644 --- a/net/bpf/Makefile +++ b/net/bpf/Makefile @@ -1 +1 @@ -obj-y := test_run.o +obj-$(CONFIG_BPF_SYSCALL) := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fab142b796ef..33e0dc168c16 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,9 +10,13 @@ #include <linux/etherdevice.h> #include <linux/filter.h> #include <linux/sched/signal.h> +#include <net/bpf_sk_storage.h> #include <net/sock.h> #include <net/tcp.h> +#define CREATE_TRACE_POINTS +#include <trace/events/bpf_test_run.h> + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time) { @@ -100,6 +104,7 @@ static int bpf_test_finish(const union bpf_attr *kattr, if (err != -ENOSPC) err = 0; out: + trace_bpf_test_finish(&err); return err; } @@ -123,12 +128,126 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, return data; } +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; 
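/* [Editor's note -- hedged userspace sketch, not part of the patch.] The new
 * ctx_in/ctx_out test attributes added by this series can be driven from
 * userspace roughly as below; error handling is elided and the field names
 * assume the uapi as extended here.
 */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

static int test_run_with_ctx(int prog_fd, void *pkt, __u32 pkt_len)
{
	struct __sk_buff ctx_in = { .priority = 7 };  /* arbitrary example */
	struct __sk_buff ctx_out = {};                /* all other fields 0 */
	union bpf_attr attr = {};

	attr.test.prog_fd = prog_fd;
	attr.test.data_in = (unsigned long)pkt;
	attr.test.data_size_in = pkt_len;
	attr.test.ctx_in = (unsigned long)&ctx_in;
	attr.test.ctx_size_in = sizeof(ctx_in);
	attr.test.ctx_out = (unsigned long)&ctx_out;
	attr.test.ctx_size_out = sizeof(ctx_out);

	/* on success, ctx_out.priority/cb reflect the program's writes */
	return syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
}
/* [End of editor's note.] */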
+ if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether buffer is initialized + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if there are no non-zero bytes + * in the buf in the range [from, to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + + FIELD_SIZEOF(struct __sk_buff, priority), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) + + FIELD_SIZEOF(struct __sk_buff, cb), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->priority = __skb->priority; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->priority = skb->priority; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); +} + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; @@ -141,6 +260,12 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(data)) return PTR_ERR(data); + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: @@ -158,6 +283,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, sk = kzalloc(sizeof(struct sock), GFP_USER); if (!sk) { kfree(data); + kfree(ctx); return -ENOMEM; } sock_net_set(sk, current->nsproxy->net_ns); @@ -166,6 +292,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb = build_skb(data, 0); if (!skb) { kfree(data); + kfree(ctx); kfree(sk); return -ENOMEM; } @@ -180,32 +307,38 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; ret = bpf_test_run(prog, skb, repeat, &retval, &duration); - if (ret) { - kfree_skb(skb); - kfree(sk); - return ret; - } + if (ret) + goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { - kfree_skb(skb); - kfree(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } + convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb);
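/* [Editor's note -- illustrative sketch, not part of the patch.]
 * convert___skb_to_skb() above whitelists fields by demanding that every
 * byte outside them be zero, which keeps room for accepting more fields
 * later without breaking old binaries. The same pattern reduced to its
 * core, reusing range_is_zero() and a hypothetical struct:
 */
struct example_ctx {
	__u32 reserved0;
	__u32 allowed;		/* the only field userspace may set */
	__u32 reserved1;
};

static int example_check(struct example_ctx *ctx)
{
	if (!range_is_zero(ctx, 0, offsetof(struct example_ctx, allowed)))
		return -EINVAL;
	if (!range_is_zero(ctx,
			   offsetof(struct example_ctx, allowed) +
			   FIELD_SIZEOF(struct example_ctx, allowed),
			   sizeof(struct example_ctx)))
		return -EINVAL;
	return 0;		/* only ->allowed may be non-zero */
}
/* [End of editor's note.] */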
ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: kfree_skb(skb); + bpf_sk_storage_free(sk); kfree(sk); + kfree(ctx); return ret; } @@ -220,6 +353,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -249,13 +385,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, union bpf_attr __user *uattr) { u32 size = kattr->test.data_size_in; + struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys flow_keys; u64 time_start, time_spent = 0; - struct bpf_skb_data_end *cb; + const struct ethhdr *eth; u32 retval, duration; - struct sk_buff *skb; - struct sock *sk; void *data; int ret; u32 i; @@ -263,46 +398,31 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; - data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); - if (IS_ERR(data)) - return PTR_ERR(data); - - sk = kzalloc(sizeof(*sk), GFP_USER); - if (!sk) { - kfree(data); - return -ENOMEM; - } - sock_net_set(sk, current->nsproxy->net_ns); - sock_init_data(NULL, sk); + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; - skb = build_skb(data, 0); - if (!skb) { - kfree(data); - kfree(sk); - return -ENOMEM; - } - skb->sk = sk; + if (size < ETH_HLEN) + return -EINVAL; - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - __skb_put(skb, size); - skb->protocol = eth_type_trans(skb, - current->nsproxy->net_ns->loopback_dev); - skb_reset_network_header(skb); + data = bpf_test_init(kattr, size, 0, 0); + if (IS_ERR(data)) + return PTR_ERR(data); - cb = (struct bpf_skb_data_end *)skb->cb; - cb->qdisc_cb.flow_keys = &flow_keys; + eth = (struct ethhdr *)data; if (!repeat) repeat = 1; + ctx.flow_keys = &flow_keys; + ctx.data = data; + ctx.data_end = (__u8 *)data + size; + rcu_read_lock(); preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - retval = __skb_flow_bpf_dissect(prog, skb, - &flow_keys_dissector, - &flow_keys); + retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, + size); if (signal_pending(current)) { preempt_enable(); @@ -335,7 +455,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, retval, duration); out: - kfree_skb(skb); - kfree(sk); + kfree(data); return ret; } diff --git a/net/bridge/br.c b/net/bridge/br.c index a5174e5001d8..3c8e4b38f054 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -40,10 +40,13 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v bool changed_addr; int err; - /* register of bridge completed, add sysfs entries */ - if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) { - br_sysfs_addbr(dev); - return NOTIFY_DONE; + if (dev->priv_flags & IFF_EBRIDGE) { + if (event == NETDEV_REGISTER) { + /* register of bridge completed, add sysfs entries */ + br_sysfs_addbr(dev); + return NOTIFY_DONE; + } + br_vlan_bridge_event(dev, event, ptr); } /* not a port of a bridge */ @@ -126,6 +129,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; } + if (event != NETDEV_UNREGISTER) + br_vlan_port_event(p, event); + /* Events that may cause spanning tree to refresh */ if 
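/* [Editor's note -- shape sketch, not part of the patch; example_* names are
 * hypothetical.] The br_device_event() rework above fans events out in two
 * directions: events on the bridge master itself go to
 * br_vlan_bridge_event(), while events on a bridge port (except
 * NETDEV_UNREGISTER) additionally reach br_vlan_port_event(). The dispatch
 * shape:
 */
static int example_device_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net_bridge_port *p;

	if (dev->priv_flags & IFF_EBRIDGE) {	/* bridge master */
		example_bridge_event(dev, event, ptr);
		return NOTIFY_DONE;
	}
	p = br_port_get_rtnl(dev);		/* NULL if not a bridge port */
	if (p && event != NETDEV_UNREGISTER)
		example_port_event(p, event);
	return NOTIFY_DONE;
}
/* [End of editor's note.] */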
(!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 6b78e6351719..15116752365a 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -21,6 +21,7 @@ #include <linux/if_vlan.h> #include <linux/inetdevice.h> #include <net/addrconf.h> +#include <net/ipv6_stubs.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_checksum.h> #endif @@ -130,7 +131,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u8 *arpptr, *sha; __be32 sip, tip; - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0; if ((dev->flags & IFF_NOARP) || !pskb_may_pull(skb, arp_hdr_len(dev))) @@ -160,7 +161,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, return; if (ipv4_is_zeronet(sip) || sip == tip) { /* prevent flooding to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } } @@ -180,7 +181,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, /* its our local ip, so don't proxy reply * and don't forward to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -216,7 +217,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, */ if (replied || br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; } neigh_release(n); @@ -392,7 +393,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, struct ipv6hdr *iphdr; struct neighbour *n; - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0; if (p && (p->flags & BR_NEIGH_SUPPRESS)) return; @@ -400,7 +401,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -413,7 +414,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) { /* prevent flooding to neigh suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -431,7 +432,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, /* its our own ip, so don't proxy reply * and don't forward to arp suppress ports */ - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } @@ -464,7 +465,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, */ if (replied || br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; } neigh_release(n); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 00573cc46c98..b1c91f66d79c 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -33,7 +33,6 @@ static const struct rhashtable_params br_fdb_rht_params = { .key_offset = offsetof(struct net_bridge_fdb_entry, key), .key_len = sizeof(struct net_bridge_fdb_key), .automatic_shrinking = true, - .locks_mul = 1, }; static struct kmem_cache *br_fdb_cache __read_mostly; diff --git 
a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48ddc60b4fbd..82225b8b54f5 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -173,6 +173,7 @@ static struct net_bridge_port *maybe_deliver( struct net_bridge_port *prev, struct net_bridge_port *p, struct sk_buff *skb, bool local_orig) { + u8 igmp_type = br_multicast_igmp_type(skb); int err; if (!should_deliver(p, skb)) @@ -184,8 +185,9 @@ static struct net_bridge_port *maybe_deliver( err = deliver_clone(prev, skb, local_orig); if (err) return ERR_PTR(err); - out: + br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX); + return p; } @@ -193,7 +195,6 @@ out: void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig) { - u8 igmp_type = br_multicast_igmp_type(skb); struct net_bridge_port *prev = NULL; struct net_bridge_port *p; @@ -226,9 +227,6 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, prev = maybe_deliver(prev, p, skb, local_orig); if (IS_ERR(prev)) goto out; - if (prev == p) - br_multicast_count(p->br, p, skb, igmp_type, - BR_MCAST_DIR_TX); } if (!prev) @@ -277,7 +275,6 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, bool local_rcv, bool local_orig) { struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - u8 igmp_type = br_multicast_igmp_type(skb); struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; @@ -304,13 +301,9 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, } prev = maybe_deliver(prev, port, skb, local_orig); -delivered: if (IS_ERR(prev)) goto out; - if (prev == port) - br_multicast_count(port->br, port, skb, igmp_type, - BR_MCAST_DIR_TX); - +delivered: if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); if ((unsigned long)rport >= (unsigned long)port) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 41f0a696a65f..6d4a24a7534b 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -179,7 +179,7 @@ int nbp_backup_change(struct net_bridge_port *p, ASSERT_RTNL(); if (backup_dev) { - if (!br_port_exists(backup_dev)) + if (!netif_is_bridge_port(backup_dev)) return -ENOENT; backup_p = br_port_get_rtnl(backup_dev); @@ -602,13 +602,15 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, call_netdevice_notifiers(NETDEV_JOIN, dev); err = dev_set_allmulti(dev, 1); - if (err) - goto put_back; + if (err) { + kfree(p); /* kobject not yet init'd, manually free */ + goto err1; + } err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) - goto err1; + goto err2; err = br_sysfs_addif(p); if (err) @@ -700,12 +702,9 @@ err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: kobject_put(&p->kobj); - p = NULL; /* kobject_put frees */ -err1: dev_set_allmulti(dev, -1); -put_back: +err1: dev_put(dev); - kfree(p); return err; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 5ea7e56119c1..014af7efef25 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -16,6 +16,9 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netfilter_bridge.h> +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +#include <net/netfilter/nf_queue.h> +#endif #include <linux/neighbour.h> #include <net/arp.h> #include <linux/export.h> @@ -23,10 +26,6 @@ #include "br_private.h" #include "br_private_tunnel.h" -/* Hook for brouter */ -br_should_route_hook_t __rcu *br_should_route_hook __read_mostly; -EXPORT_SYMBOL(br_should_route_hook); - static int 
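/* [Editor's note -- illustrative sketch, not part of the patch; example_*
 * names are hypothetical and struct example_port is assumed to embed a
 * struct kobject kobj.] The br_add_if() error unwinding above follows the
 * standard kobject lifetime rule: before kobject_init_and_add() has run the
 * object is plain memory and must be kfree()d, but once that call has been
 * made (even if it failed) only kobject_put() may free it, via the ktype's
 * release function. The idiom:
 */
static int example_add(struct example_port *p, struct kobject *parent)
{
	int err = example_early_setup(p);

	if (err) {
		kfree(p);		/* kobject not initialised yet */
		return err;
	}
	err = kobject_init_and_add(&p->kobj, &example_ktype, parent, "port");
	if (err) {
		kobject_put(&p->kobj);	/* release() performs the kfree() */
		return err;
	}
	return 0;
}
/* [End of editor's note.] */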
br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -197,13 +196,63 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_bridge_port *p = br_port_get_rcu(skb->dev); - __br_handle_local_finish(skb); - BR_INPUT_SKB_CB(skb)->brdev = p->br->dev; - br_pass_frame_up(skb); - return 0; + /* return 1 to signal the okfn() was called so it's ok to use the skb */ + return 1; +} + +static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) +{ +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + struct nf_hook_entries *e = NULL; + struct nf_hook_state state; + unsigned int verdict, i; + struct net *net; + int ret; + + net = dev_net(skb->dev); +#ifdef HAVE_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING])) + goto frame_finish; +#endif + + e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); + if (!e) + goto frame_finish; + + nf_hook_state_init(&state, NF_BR_PRE_ROUTING, + NFPROTO_BRIDGE, skb->dev, NULL, NULL, + net, br_handle_frame_finish); + + for (i = 0; i < e->num_hook_entries; i++) { + verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state); + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) { + *pskb = skb; + return RX_HANDLER_PASS; + } + break; + case NF_DROP: + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + case NF_QUEUE: + ret = nf_queue(skb, &state, e, i, verdict); + if (ret == 1) + continue; + return RX_HANDLER_CONSUMED; + default: /* STOLEN */ + return RX_HANDLER_CONSUMED; + } + } +frame_finish: + net = dev_net(skb->dev); + br_handle_frame_finish(net, NULL, skb); +#else + br_handle_frame_finish(dev_net(skb->dev), NULL, skb); +#endif + return RX_HANDLER_CONSUMED; } /* @@ -215,7 +264,6 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; - br_should_route_hook_t *rhook; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; @@ -227,6 +275,8 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) if (!skb) return RX_HANDLER_CONSUMED; + memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); + p = br_port_get_rcu(skb->dev); if (p->flags & BR_VLAN_TUNNEL) { if (br_handle_ingress_vlan_tunnel(skb, p, @@ -280,32 +330,28 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } - /* Deliver packet to local host only */ - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), - NULL, skb, skb->dev, NULL, br_handle_local_finish); - return RX_HANDLER_CONSUMED; + /* The else clause should be hit when nf_hook(): + * - returns < 0 (drop/error) + * - returns = 0 (stolen/nf_queue) + * Thus return 1 from the okfn() to signal the skb is ok to pass + */ + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + br_handle_local_finish) == 1) { + return RX_HANDLER_PASS; + } else { + return RX_HANDLER_CONSUMED; + } } forward: switch (p->state) { case BR_STATE_FORWARDING: - rhook = rcu_dereference(br_should_route_hook); - if (rhook) { - if ((*rhook)(skb)) { - *pskb = skb; - return RX_HANDLER_PASS; - } - dest = eth_hdr(skb)->h_dest; - } - /* fall through */ case BR_STATE_LEARNING: if (ether_addr_equal(p->br->dev->dev_addr, dest)) skb->pkt_type = PACKET_HOST; - NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, 
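/* [Editor's note -- sketch of the convention used above, not part of the
 * patch; example_* names are hypothetical.] NF_HOOK() returns the okfn()'s
 * return value when the hooks accept the skb, and <= 0 when the skb was
 * dropped, stolen or queued. Returning 1 from the okfn() therefore lets the
 * caller tell "skb still live" apart from "skb consumed":
 */
static int example_okfn(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* ... deliver the skb locally ... */
	return 1;	/* tells the NF_HOOK() caller the skb is still valid */
}

static rx_handler_result_t example_rx(struct sk_buff *skb)
{
	if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), NULL,
		    skb, skb->dev, NULL, example_okfn) == 1)
		return RX_HANDLER_PASS;		/* skb survived the hook */
	return RX_HANDLER_CONSUMED;		/* dropped/stolen/queued */
}
/* [End of editor's note.] */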
skb->dev, NULL, - br_handle_frame_finish); - break; + return nf_hook_bridge_pre(skb, pskb); default: drop: kfree_skb(skb); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index f69c8d91dc81..bf6acd34234d 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -26,14 +26,14 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, if (!br->multicast_router || hlist_empty(&br->router_list)) return 0; - nest = nla_nest_start(skb, MDBA_ROUTER); + nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; hlist_for_each_entry_rcu(p, &br->router_list, rlist) { if (!p) continue; - port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT); + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); if (!port_nest) goto fail; if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) || @@ -86,7 +86,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; - nest = nla_nest_start(skb, MDBA_MDB); + nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) return -EMSGSIZE; @@ -98,7 +98,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, if (idx < s_idx) goto skip; - nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); + nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (!nest2) { err = -EMSGSIZE; break; @@ -124,7 +124,8 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, e.addr.u.ip6 = p->addr.u.ip6; #endif e.addr.proto = p->addr.proto; - nest_ent = nla_nest_start(skb, MDBA_MDB_ENTRY_INFO); + nest_ent = nla_nest_start_noflag(skb, + MDBA_MDB_ENTRY_INFO); if (!nest_ent) { nla_nest_cancel(skb, nest2); err = -EMSGSIZE; @@ -248,10 +249,10 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb, memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; - nest = nla_nest_start(skb, MDBA_MDB); + nest = nla_nest_start_noflag(skb, MDBA_MDB); if (nest == NULL) goto cancel; - nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY); + nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY); if (nest2 == NULL) goto end; @@ -444,7 +445,7 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb, memset(bpm, 0, sizeof(*bpm)); bpm->family = AF_BRIDGE; bpm->ifindex = dev->ifindex; - nest = nla_nest_start(skb, MDBA_ROUTER); + nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (!nest) goto cancel; @@ -529,8 +530,8 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; int err; - err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, NULL, - NULL); + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, NULL, NULL); if (err < 0) return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a0e369179f6d..c2a30f79a9d0 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -44,7 +44,6 @@ static const struct rhashtable_params br_mdb_rht_params = { .key_offset = offsetof(struct net_bridge_mdb_entry, addr), .key_len = sizeof(struct br_ip), .automatic_shrinking = true, - .locks_mul = 1, }; static void br_multicast_start_querier(struct net_bridge *br, @@ -65,23 +64,6 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, __u16 vid, const unsigned char *src); #endif -static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b) -{ - if (a->proto != b->proto) - return 0; - if (a->vid != b->vid) - return 0; - switch (a->proto) { - case htons(ETH_P_IP): - return a->u.ip4 == b->u.ip4; -#if 
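/* [Editor's note -- illustrative, not part of the patch.] The
 * nla_nest_start() -> nla_nest_start_noflag() conversions above preserve the
 * existing wire format: after the netlink API split, the plain function ORs
 * NLA_F_NESTED into the attribute type while the _noflag variant behaves
 * like the old nla_nest_start(). The relationship, roughly:
 */
static inline struct nlattr *example_nest_start(struct sk_buff *skb,
						int attrtype)
{
	return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED);
}
/* [End of editor's note.] */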
IS_ENABLED(CONFIG_IPV6) - case htons(ETH_P_IPV6): - return ipv6_addr_equal(&a->u.ip6, &b->u.ip6); -#endif - } - return 0; -} - static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, struct br_ip *dst) { @@ -517,7 +499,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( if (src) memcpy(p->eth_addr, src, ETH_ALEN); else - memset(p->eth_addr, 0xff, ETH_ALEN); + eth_broadcast_addr(p->eth_addr); return p; } @@ -601,6 +583,7 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, if (ipv4_is_local_multicast(group)) return 0; + memset(&br_group, 0, sizeof(br_group)); br_group.u.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; @@ -1497,6 +1480,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, own_query = port ? &port->ip4_own_query : &br->ip4_own_query; + memset(&br_group, 0, sizeof(br_group)); br_group.u.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; @@ -1520,6 +1504,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, own_query = port ? &port->ip6_own_query : &br->ip6_own_query; + memset(&br_group, 0, sizeof(br_group)); br_group.u.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; @@ -2028,7 +2013,8 @@ static void br_multicast_start_querier(struct net_bridge *br, __br_multicast_open(br, query); - list_for_each_entry(port, &br->port_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(port, &br->port_list, list) { if (port->state == BR_STATE_DISABLED || port->state == BR_STATE_BLOCKING) continue; @@ -2040,6 +2026,7 @@ static void br_multicast_start_querier(struct net_bridge *br, br_multicast_enable(&port->ip6_own_query); #endif } + rcu_read_unlock(); } int br_multicast_toggle(struct net_bridge *br, unsigned long val) @@ -2189,7 +2176,7 @@ int br_multicast_list_adjacent(struct net_device *dev, int count = 0; rcu_read_lock(); - if (!br_ip_list || !br_port_exists(dev)) + if (!br_ip_list || !netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); @@ -2236,7 +2223,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) bool ret = false; rcu_read_lock(); - if (!br_port_exists(dev)) + if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); @@ -2272,7 +2259,7 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) bool ret = false; rcu_read_lock(); - if (!br_port_exists(dev)) + if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 9c07591b0232..a5acad29cd4f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -102,7 +102,7 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, size_t vinfo_sz = 0; rcu_read_lock(); - if (br_port_exists(dev)) { + if (netif_is_bridge_port(dev)) { p = br_port_get_rcu(dev); vg = nbp_vlan_group_rcu(p); } else if (dev->priv_flags & IFF_EBRIDGE) { @@ -413,9 +413,9 @@ static int br_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; if (event == RTM_NEWLINK && port) { - struct nlattr *nest - = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); + struct nlattr *nest; + nest = nla_nest_start(skb, IFLA_PROTINFO); if (nest == NULL || br_port_fill_attrs(skb, port) < 0) goto nla_put_failure; nla_nest_end(skb, nest); @@ -439,7 +439,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, rcu_read_unlock(); goto done; } - af = nla_nest_start(skb, IFLA_AF_SPEC); + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) { rcu_read_unlock(); goto 
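/* [Editor's note -- illustrative sketch, not part of the patch;
 * example_fill_key() and the stand-in struct are hypothetical.] The
 * memset(&br_group, 0, ...) additions above matter because struct br_ip is
 * used as an rhashtable key: padding and the unused half of the address
 * union must be zeroed before lookup/insert, or bytewise key comparison
 * sees stale stack garbage. The pattern:
 */
struct example_key {			/* stand-in for struct br_ip */
	union {
		__be32 ip4;
		struct in6_addr ip6;
	} u;
	__be16 proto;
	u16 vid;
};

static void example_fill_key(struct example_key *key, __be32 group, u16 vid)
{
	memset(key, 0, sizeof(*key));	/* clear padding + unused union bytes */
	key->u.ip4 = group;
	key->proto = htons(ETH_P_IP);
	key->vid = vid;
}
/* [End of editor's note.] */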
nla_put_failure; @@ -880,8 +880,10 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { - err = nla_parse_nested(tb, IFLA_BRPORT_MAX, protinfo, - br_port_policy, NULL); + err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX, + protinfo, + br_port_policy, + NULL); if (err) return err; @@ -1441,7 +1443,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, - br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT))) + br_opt_get(br, BROPT_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -1569,7 +1571,7 @@ static int br_fill_linkxstats(struct sk_buff *skb, return -EINVAL; } - nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); + nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index da8cb99fd259..34629d558709 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -97,7 +97,7 @@ static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid, __be32 tid = tunnel_id_to_key32(tunnel_id); struct nlattr *tmap; - tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO); + tmap = nla_nest_start_noflag(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO); if (!tmap) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID, @@ -230,8 +230,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, memset(tinfo, 0, sizeof(*tinfo)); - err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, attr, - vlan_tunnel_policy, NULL); + err = nla_parse_nested_deprecated(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, + attr, vlan_tunnel_policy, NULL); if (err < 0) return err; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 00deef7fc1f3..334a8c496b50 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -288,8 +288,6 @@ struct net_bridge_port { #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) #define br_promisc_port(p) ((p)->flags & BR_PROMISC) -#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT) - static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); @@ -297,13 +295,13 @@ static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *d static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev) { - return br_port_exists(dev) ? + return netif_is_bridge_port(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_device *dev) { - return br_port_exists(dev) ? + return netif_is_bridge_port(dev) ? 
rcu_dereference_rtnl(dev->rx_handler_data) : NULL; } @@ -323,6 +321,7 @@ enum net_bridge_opts { BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, + BROPT_VLAN_BRIDGE_BINDING, }; struct net_bridge { @@ -427,15 +426,16 @@ struct br_input_skb_cb { struct net_device *brdev; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - int igmp; - int mrouters_only; + u8 igmp; + u8 mrouters_only:1; #endif - - bool proxyarp_replied; - bool src_port_isolated; - + u8 proxyarp_replied:1; + u8 src_port_isolated:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING - bool vlan_filtered; + u8 vlan_filtered:1; +#endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + u8 br_netfilter_broute:1; #endif #ifdef CONFIG_NET_SWITCHDEV @@ -896,6 +896,9 @@ int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, struct br_vlan_stats *stats); +void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); +void br_vlan_bridge_event(struct net_device *dev, unsigned long event, + void *ptr); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -1079,6 +1082,16 @@ static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, struct br_vlan_stats *stats) { } + +static inline void br_vlan_port_event(struct net_bridge_port *p, + unsigned long event) +{ +} + +static inline void br_vlan_bridge_event(struct net_device *dev, + unsigned long event, void *ptr) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 808e2b914015..8d65ae5210e0 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -117,7 +117,8 @@ void br_stp_disable_port(struct net_bridge_port *p) del_timer(&p->forward_delay_timer); del_timer(&p->hold_timer); - br_fdb_delete_by_port(br, p, 0, 0); + if (!rcu_access_pointer(p->backup_port)) + br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 96abf8feb9dc..2db63997f313 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -7,6 +7,8 @@ #include "br_private.h" #include "br_private_tunnel.h" +static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); + static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { @@ -21,7 +23,6 @@ static const struct rhashtable_params br_vlan_rht_params = { .key_offset = offsetof(struct net_bridge_vlan, vid), .key_len = sizeof(u16), .nelem_hint = 3, - .locks_mul = 1, .max_size = VLAN_N_VID, .obj_cmpfn = br_vlan_cmp, .automatic_shrinking = true, @@ -294,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, __vlan_add_list(v); __vlan_add_flags(v, flags); + + if (p) + nbp_vlan_set_vlan_dev_state(p, v->vid); out: return err; @@ -358,6 +362,7 @@ static int __vlan_del(struct net_bridge_vlan *v) rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); + nbp_vlan_set_vlan_dev_state(p, v->vid); call_rcu(&v->rcu, nbp_vlan_rcu_free); } @@ -1265,3 +1270,211 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info); + +static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) +{ + return is_vlan_dev(dev) && + !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); +} + +static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, + __always_unused void *data) +{ + return 
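/* [Editor's note -- hedged usage sketch, not part of the patch.] With
 * VLAN_FLAG_BRIDGE_BINDING set on a vlan device stacked on a bridge, the
 * helpers that follow track per-VLAN link state: the vlan device has carrier
 * only while the bridge itself has carrier and at least one member port
 * carrying that VID is up. With a new enough iproute2 this is requested at
 * creation time (option name assumed):
 *
 *   ip link add link br0 name br0.10 type vlan id 10 bridge_binding on
 *
 * The carrier decision implemented below reduces to:
 */
static bool example_vlan_carrier(bool bridge_has_carrier,
				 bool any_port_with_vid_up)
{
	return bridge_has_carrier && any_port_with_vid_up;
}
/* [End of editor's note.] */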
br_vlan_is_bind_vlan_dev(dev); +} + +static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) +{ + int found; + + rcu_read_lock(); + found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, + NULL); + rcu_read_unlock(); + + return !!found; +} + +struct br_vlan_bind_walk_data { + u16 vid; + struct net_device *result; +}; + +static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, + void *data_in) +{ + struct br_vlan_bind_walk_data *data = data_in; + int found = 0; + + if (br_vlan_is_bind_vlan_dev(dev) && + vlan_dev_priv(dev)->vlan_id == data->vid) { + data->result = dev; + found = 1; + } + + return found; +} + +static struct net_device * +br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) +{ + struct br_vlan_bind_walk_data data = { + .vid = vid, + }; + + rcu_read_lock(); + netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, + &data); + rcu_read_unlock(); + + return data.result; +} + +static bool br_vlan_is_dev_up(const struct net_device *dev) +{ + return !!(dev->flags & IFF_UP) && netif_oper_up(dev); +} + +static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, + struct net_device *vlan_dev) +{ + u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; + struct net_bridge_vlan_group *vg; + struct net_bridge_port *p; + bool has_carrier = false; + + if (!netif_carrier_ok(br->dev)) { + netif_carrier_off(vlan_dev); + return; + } + + list_for_each_entry(p, &br->port_list, list) { + vg = nbp_vlan_group(p); + if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { + has_carrier = true; + break; + } + } + + if (has_carrier) + netif_carrier_on(vlan_dev); + else + netif_carrier_off(vlan_dev); +} + +static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + struct net_bridge_vlan *vlan; + struct net_device *vlan_dev; + + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, + vlan->vid); + if (vlan_dev) { + if (br_vlan_is_dev_up(p->dev)) { + if (netif_carrier_ok(p->br->dev)) + netif_carrier_on(vlan_dev); + } else { + br_vlan_set_vlan_dev_state(p->br, vlan_dev); + } + } + } +} + +static void br_vlan_upper_change(struct net_device *dev, + struct net_device *upper_dev, + bool linking) +{ + struct net_bridge *br = netdev_priv(dev); + + if (!br_vlan_is_bind_vlan_dev(upper_dev)) + return; + + if (linking) { + br_vlan_set_vlan_dev_state(br, upper_dev); + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); + } else { + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, + br_vlan_has_upper_bind_vlan_dev(dev)); + } +} + +struct br_vlan_link_state_walk_data { + struct net_bridge *br; +}; + +static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, + void *data_in) +{ + struct br_vlan_link_state_walk_data *data = data_in; + + if (br_vlan_is_bind_vlan_dev(vlan_dev)) + br_vlan_set_vlan_dev_state(data->br, vlan_dev); + + return 0; +} + +static void br_vlan_link_state_change(struct net_device *dev, + struct net_bridge *br) +{ + struct br_vlan_link_state_walk_data data = { + .br = br + }; + + rcu_read_lock(); + netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, + &data); + rcu_read_unlock(); +} + +/* Must be protected by RTNL. 
*/ +static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) +{ + struct net_device *vlan_dev; + + if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) + return; + + vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); + if (vlan_dev) + br_vlan_set_vlan_dev_state(p->br, vlan_dev); +} + +/* Must be protected by RTNL. */ +void br_vlan_bridge_event(struct net_device *dev, unsigned long event, + void *ptr) +{ + struct netdev_notifier_changeupper_info *info; + struct net_bridge *br; + + switch (event) { + case NETDEV_CHANGEUPPER: + info = ptr; + br_vlan_upper_change(dev, info->upper_dev, info->linking); + break; + + case NETDEV_CHANGE: + case NETDEV_UP: + br = netdev_priv(dev); + if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) + return; + br_vlan_link_state_change(dev, br); + break; + } +} + +/* Must be protected by RTNL. */ +void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) +{ + if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) + return; + + switch (event) { + case NETDEV_CHANGE: + case NETDEV_DOWN: + case NETDEV_UP: + br_vlan_set_all_vlan_dev_state(p); + break; + } +} diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6d2c4eed2dc8..758151863669 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -34,7 +34,6 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = { .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id), .key_len = sizeof(__be64), .nelem_hint = 3, - .locks_mul = 1, .obj_cmpfn = br_vlan_tunid_cmp, .automatic_shrinking = true, }; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 276b60262981..ec2652a459da 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -15,6 +15,8 @@ #include <linux/module.h> #include <linux/if_bridge.h> +#include "../br_private.h" + /* EBT_ACCEPT means the frame will be bridged * EBT_DROP means the frame will be routed */ @@ -48,30 +50,63 @@ static const struct ebt_table broute_table = { .me = THIS_MODULE, }; -static int ebt_broute(struct sk_buff *skb) +static unsigned int ebt_broute(void *priv, struct sk_buff *skb, + const struct nf_hook_state *s) { + struct net_bridge_port *p = br_port_get_rcu(skb->dev); struct nf_hook_state state; + unsigned char *dest; int ret; + if (!p || p->state != BR_STATE_FORWARDING) + return NF_ACCEPT; + nf_hook_state_init(&state, NF_BR_BROUTING, - NFPROTO_BRIDGE, skb->dev, NULL, NULL, - dev_net(skb->dev), NULL); + NFPROTO_BRIDGE, s->in, NULL, NULL, + s->net, NULL); ret = ebt_do_table(skb, &state, state.net->xt.broute_table); - if (ret == NF_DROP) - return 1; /* route it */ - return 0; /* bridge it */ + + if (ret != NF_DROP) + return ret; + + /* DROP in ebtables -t broute means that the + * skb should be routed, not bridged. + * This is awkward, but can't be changed for compatibility + * reasons. + * + * We map DROP to ACCEPT and set the ->br_netfilter_broute flag. + */ + BR_INPUT_SKB_CB(skb)->br_netfilter_broute = 1; + + /* undo PACKET_HOST mangling done in br_input in case the dst + * address matches the logical bridge but not the port. 
+ */ + dest = eth_hdr(skb)->h_dest; + if (skb->pkt_type == PACKET_HOST && + !ether_addr_equal(skb->dev->dev_addr, dest) && + ether_addr_equal(p->br->dev->dev_addr, dest)) + skb->pkt_type = PACKET_OTHERHOST; + + return NF_ACCEPT; } +static const struct nf_hook_ops ebt_ops_broute = { + .hook = ebt_broute, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_FIRST, +}; + static int __net_init broute_net_init(struct net *net) { - return ebt_register_table(net, &broute_table, NULL, + return ebt_register_table(net, &broute_table, &ebt_ops_broute, + &net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.broute_table, NULL); + ebt_unregister_table(net, net->xt.broute_table, &ebt_ops_broute); } static struct pernet_operations broute_net_ops = { @@ -81,21 +116,11 @@ static struct pernet_operations broute_net_ops = { static int __init ebtable_broute_init(void) { - int ret; - - ret = register_pernet_subsys(&broute_net_ops); - if (ret < 0) - return ret; - /* see br_input.c */ - RCU_INIT_POINTER(br_should_route_hook, - (br_should_route_hook_t *)ebt_broute); - return 0; + return register_pernet_subsys(&broute_net_ops); } static void __exit ebtable_broute_fini(void) { - RCU_INIT_POINTER(br_should_route_hook, NULL); - synchronize_net(); unregister_pernet_subsys(&broute_net_ops); } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index eb15891f8b9f..6b07e4978eb3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1221,10 +1221,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, mutex_unlock(&ebt_mutex); WRITE_ONCE(*res, table); - - if (!ops) - return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret) { __ebt_unregister_table(net, table); @@ -1248,8 +1244,7 @@ out: void ebt_unregister_table(struct net *net, struct ebt_table *table, const struct nf_hook_ops *ops) { - if (ops) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); __ebt_unregister_table(net, table); } @@ -2032,7 +2027,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - if (WARN_ON(type == EBT_COMPAT_TARGET && size_left)) + /* rule should have no remaining data after target */ + if (type == EBT_COMPAT_TARGET && size_left) return -EINVAL; match32 = (struct compat_ebt_entry_mwt *) buf; @@ -2157,7 +2153,9 @@ static int compat_copy_entries(unsigned char *data, unsigned int size_user, if (ret < 0) return ret; - WARN_ON(size_remaining); + if (size_remaining) + return -EINVAL; + return state->buf_kern_offset; } diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 711d7156efd8..6c6e01963aac 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -186,15 +186,19 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { + struct Qdisc *sch; + /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); - qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc)); - - if (likely(qlen == 0)) + sch = rcu_dereference_bh(txq->qdisc); + if (likely(qdisc_is_empty(sch))) goto noxoff; + /* The queue length can only be checked explicitly for !NOLOCK + * qdiscs; for NOLOCK qdiscs always set flow off instead + */ high = (caifd->netdev->tx_queue_len * q_high) / 100; - if (!(sch->flags & TCQ_F_NOLOCK)
&& likely(sch->q.qlen < high)) goto noxoff; } diff --git a/net/can/af_can.c b/net/can/af_can.c index 1684ba5b51eb..e8fd5dc1780a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -89,13 +89,7 @@ static atomic_t skbcounter = ATOMIC_INIT(0); int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct sock *sk = sock->sk; - switch (cmd) { - - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - default: return -ENOIOCTLCMD; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 79bb8afa9c0c..a34ee52f19ea 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1689,6 +1689,7 @@ static const struct proto_ops bcm_ops = { .getname = sock_no_getname, .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, diff --git a/net/can/gw.c b/net/can/gw.c index 53859346dc9a..5275ddf580bc 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -662,8 +662,8 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); - err = nlmsg_parse(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, - cgw_policy, NULL); + err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, + CGW_MAX, cgw_policy, NULL); if (err < 0) return err; diff --git a/net/can/raw.c b/net/can/raw.c index c70207537488..afcbff063a67 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -846,6 +846,7 @@ static const struct proto_ops raw_ops = { .getname = raw_getname, .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index fa9530dd876e..6f739de28918 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2398,7 +2398,7 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + dout("%s req %p tid %llu cb %ps result %d\n", __func__, req, req->r_tid, req->r_callback, req->r_result); if (req->r_callback) diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d3736f5bffec..74cafc0142ea 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, pages + got); + num_pages - got, write_page ? 
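/* [Editor's note -- illustrative, not part of the patch; example_pin() is
 * hypothetical.] The write_page ? FOLL_WRITE : 0 conversion above reflects
 * the get_user_pages_fast() signature change from a write boolean to a
 * gup_flags bitmask. The call shape:
 */
static int example_pin(unsigned long addr, int nr_pages, bool writable,
		       struct page **pages)
{
	unsigned int gup_flags = writable ? FOLL_WRITE : 0;

	return get_user_pages_fast(addr, nr_pages, gup_flags, pages);
}
/* [End of editor's note.] */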
FOLL_WRITE : 0, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/compat.c b/net/compat.c index eeea5eb71639..a031bd333092 100644 --- a/net/compat.c +++ b/net/compat.c @@ -395,63 +395,6 @@ COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __compat_sys_setsockopt(fd, level, optname, optval, optlen); } -int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) -{ - struct compat_timeval __user *ctv; - int err; - struct timeval tv; - - if (COMPAT_USE_64BIT_TIME) - return sock_get_timestamp(sk, userstamp); - - ctv = (struct compat_timeval __user *) userstamp; - err = -ENOENT; - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sock_read_timestamp(sk)); - - if (tv.tv_sec == -1) - return err; - if (tv.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - tv = ktime_to_timeval(kt); - } - err = 0; - if (put_user(tv.tv_sec, &ctv->tv_sec) || - put_user(tv.tv_usec, &ctv->tv_usec)) - err = -EFAULT; - return err; -} -EXPORT_SYMBOL(compat_sock_get_timestamp); - -int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) -{ - struct compat_timespec __user *ctv; - int err; - struct timespec ts; - - if (COMPAT_USE_64BIT_TIME) - return sock_get_timestampns (sk, userstamp); - - ctv = (struct compat_timespec __user *) userstamp; - err = -ENOENT; - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sock_read_timestamp(sk)); - if (ts.tv_sec == -1) - return err; - if (ts.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - ts = ktime_to_timespec(kt); - } - err = 0; - if (put_user(ts.tv_sec, &ctv->tv_sec) || - put_user(ts.tv_nsec, &ctv->tv_nsec)) - err = -EFAULT; - return err; -} -EXPORT_SYMBOL(compat_sock_get_timestampns); - static int __compat_sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) diff --git a/net/core/Makefile b/net/core/Makefile index f97d6254e564..a104dc8faafc 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c new file mode 100644 index 000000000000..cc9597a87770 --- /dev/null +++ b/net/core/bpf_sk_storage.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <net/bpf_sk_storage.h> +#include <net/sock.h> +#include <uapi/linux/btf.h> + +static atomic_t cache_idx; + +struct bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* The map is not the primary owner of a bpf_sk_storage_elem. + * Instead, the sk->sk_bpf_storage is. + * + * The map (bpf_sk_storage_map) is for two purposes + * 1. Define the size of the "sk local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. + * + * When a bpf local storage is being looked up for a + * particular sk, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * sk->sk_bpf_storage. + * + * Hence, consider sk->sk_bpf_storage as the mini-map + * with the "bpf_map" pointer as the searching key.
+ */ +struct bpf_sk_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Multiple + * buckets are used to reduce contention. + */ + struct bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_sk_storage_data { + /* smap is used as the searching key when looking up + * from sk->sk_bpf_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cacheline accesses in the cache hit case. + */ + struct bpf_sk_storage_map __rcu *smap; + u8 data[0] __aligned(8); +}; + +/* Linked to bpf_sk_storage and bpf_sk_storage_map */ +struct bpf_sk_storage_elem { + struct hlist_node map_node; /* Linked to bpf_sk_storage_map */ + struct hlist_node snode; /* Linked to bpf_sk_storage */ + struct bpf_sk_storage __rcu *sk_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in another cacheline to minimize + * the number of cacheline accesses during a cache hit. + */ + struct bpf_sk_storage_data sdata ____cacheline_aligned; +}; + +#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) +#define BPF_SK_STORAGE_CACHE_SIZE 16 + +struct bpf_sk_storage { + struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_sk_storage_elem */ + struct sock *sk; /* The sk that owns the above "list" of + * bpf_sk_storage_elem. + */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +static struct bucket *select_bucket(struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; +} + +static int omem_charge(struct sock *sk, unsigned int size) +{ + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; +} + +static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem) +{ + return !hlist_unhashed(&selem->snode); +} + +static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem) +{ + return !hlist_unhashed(&selem->map_node); +} + +static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap, + struct sock *sk, void *value, + bool charge_omem) +{ + struct bpf_sk_storage_elem *selem; + + if (charge_omem && omem_charge(sk, smap->elem_size)) + return NULL; + + selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (selem) { + if (value) + memcpy(SDATA(selem)->data, value, smap->map.value_size); + return selem; + } + + if (charge_omem) + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + + return NULL; +} + +/* sk_storage->lock must be held and selem->sk_storage == sk_storage. + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_elem *selem, + bool uncharge_omem) +{ + struct bpf_sk_storage_map *smap; + bool free_sk_storage; + struct sock *sk; + + smap = rcu_dereference(SDATA(selem)->smap); + sk = sk_storage->sk; + + /* All uncharging on sk->sk_omem_alloc must be done first. + * sk may be freed once the last selem is unlinked from sk_storage.
+ */ + if (uncharge_omem) + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + + free_sk_storage = hlist_is_singular_node(&selem->snode, + &sk_storage->list); + if (free_sk_storage) { + atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc); + sk_storage->sk = NULL; + /* After this RCU_INIT, sk may be freed and cannot be used */ + RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); + + /* sk_storage is not freed now. sk_storage->lock is + * still held and raw_spin_unlock_bh(&sk_storage->lock) + * will be done by the caller. + * + * Although the unlock will be done under + * rcu_read_lock(), it is more intuitive to + * read if kfree_rcu(sk_storage, rcu) is done + * after the raw_spin_unlock_bh(&sk_storage->lock). + * + * Hence, a "bool free_sk_storage" is returned + * to the caller which then calls the kfree_rcu() + * after unlock. + */ + } + hlist_del_init_rcu(&selem->snode); + if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL); + + kfree_rcu(selem, rcu); + + return free_sk_storage; +} + +static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage *sk_storage; + bool free_sk_storage = false; + + if (unlikely(!selem_linked_to_sk(selem))) + /* selem has already been unlinked from sk */ + return; + + sk_storage = rcu_dereference(selem->sk_storage); + raw_spin_lock_bh(&sk_storage->lock); + if (likely(selem_linked_to_sk(selem))) + free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + raw_spin_unlock_bh(&sk_storage->lock); + + if (free_sk_storage) + kfree_rcu(sk_storage, rcu); +} + +/* sk_storage->lock must be held and sk_storage->list cannot be empty */ +static void __selem_link_sk(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_elem *selem) +{ + RCU_INIT_POINTER(selem->sk_storage, sk_storage); + hlist_add_head(&selem->snode, &sk_storage->list); +} + +static void selem_unlink_map(struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_map *smap; + struct bucket *b; + + if (unlikely(!selem_linked_to_map(selem))) + /* selem has already been unlinked from smap */ + return; + + smap = rcu_dereference(SDATA(selem)->smap); + b = select_bucket(smap, selem); + raw_spin_lock_bh(&b->lock); + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); + raw_spin_unlock_bh(&b->lock); +} + +static void selem_link_map(struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bucket *b = select_bucket(smap, selem); + + raw_spin_lock_bh(&b->lock); + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); + raw_spin_unlock_bh(&b->lock); +} + +static void selem_unlink(struct bpf_sk_storage_elem *selem) +{ + /* Always unlink from map before unlinking from sk_storage + * because selem will be freed after successfully unlinked from + * the sk_storage.
+ */ + selem_unlink_map(selem); + selem_unlink_sk(selem); +} + +static struct bpf_sk_storage_data * +__sk_storage_lookup(struct bpf_sk_storage *sk_storage, + struct bpf_sk_storage_map *smap, + bool cacheit_lockit) +{ + struct bpf_sk_storage_data *sdata; + struct bpf_sk_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + + sdata = SDATA(selem); + if (cacheit_lockit) { + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next __sk_storage_lookup(). + */ + raw_spin_lock_bh(&sk_storage->lock); + if (selem_linked_to_sk(selem)) + rcu_assign_pointer(sk_storage->cache[smap->cache_idx], + sdata); + raw_spin_unlock_bh(&sk_storage->lock); + } + + return sdata; +} + +static struct bpf_sk_storage_data * +sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_map *smap; + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage) + return NULL; + + smap = (struct bpf_sk_storage_map *)map; + return __sk_storage_lookup(sk_storage, smap, cacheit_lockit); +} + +static int check_flags(const struct bpf_sk_storage_data *old_sdata, + u64 map_flags) +{ + if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + +static int sk_storage_alloc(struct sock *sk, + struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *first_selem) +{ + struct bpf_sk_storage *prev_sk_storage, *sk_storage; + int err; + + err = omem_charge(sk, sizeof(*sk_storage)); + if (err) + return err; + + sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); + if (!sk_storage) { + err = -ENOMEM; + goto uncharge; + } + INIT_HLIST_HEAD(&sk_storage->list); + raw_spin_lock_init(&sk_storage->lock); + sk_storage->sk = sk; + + __selem_link_sk(sk_storage, first_selem); + selem_link_map(smap, first_selem); + /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. + * Hence, atomic ops is used to set sk->sk_bpf_storage + * from NULL to the newly allocated sk_storage ptr. + * + * From now on, the sk->sk_bpf_storage pointer is protected + * by the sk_storage->lock. Hence, when freeing + * the sk->sk_bpf_storage, the sk_storage->lock must + * be held before setting sk->sk_bpf_storage to NULL. + */ + prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage, + NULL, sk_storage); + if (unlikely(prev_sk_storage)) { + selem_unlink_map(first_selem); + err = -EAGAIN; + goto uncharge; + + /* Note that even first_selem was linked to smap's + * bucket->list, first_selem can be freed immediately + * (instead of kfree_rcu) because + * bpf_sk_storage_map_free() does a + * synchronize_rcu() before walking the bucket->list. + * Hence, no one is accessing selem from the + * bucket->list under rcu_read_lock(). 
+ */ + } + + return 0; + +uncharge: + kfree(sk_storage); + atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); + return err; +} + +/* sk cannot be going away because it is linking new elem + * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). + * Otherwise, it will become a leak (and other memory issues + * during map destruction). + */ +static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk, + struct bpf_map *map, + void *value, + u64 map_flags) +{ + struct bpf_sk_storage_data *old_sdata = NULL; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_map *smap; + int err; + + /* BPF_EXIST and BPF_NOEXIST cannot be both set */ + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || + /* BPF_F_LOCK can only be used in a value with spin_lock */ + unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + return ERR_PTR(-EINVAL); + + smap = (struct bpf_sk_storage_map *)map; + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + /* Very first elem for this sk */ + err = check_flags(NULL, map_flags); + if (err) + return ERR_PTR(err); + + selem = selem_alloc(smap, sk, value, true); + if (!selem) + return ERR_PTR(-ENOMEM); + + err = sk_storage_alloc(sk, smap, selem); + if (err) { + kfree(selem); + atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + return ERR_PTR(err); + } + + return SDATA(selem); + } + + if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { + /* Hoping to find an old_sdata to do inline update + * such that it can avoid taking the sk_storage->lock + * and changing the lists. + */ + old_sdata = __sk_storage_lookup(sk_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + return ERR_PTR(err); + if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) { + copy_map_value_locked(map, old_sdata->data, + value, false); + return old_sdata; + } + } + + raw_spin_lock_bh(&sk_storage->lock); + + /* Recheck sk_storage->list under sk_storage->lock */ + if (unlikely(hlist_empty(&sk_storage->list))) { + /* A parallel del is happening and sk_storage is going + * away. It has just been checked before, so very + * unlikely. Return instead of retry to keep things + * simple. + */ + err = -EAGAIN; + goto unlock_err; + } + + old_sdata = __sk_storage_lookup(sk_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + goto unlock_err; + + if (old_sdata && (map_flags & BPF_F_LOCK)) { + copy_map_value_locked(map, old_sdata->data, value, false); + selem = SELEM(old_sdata); + goto unlock; + } + + /* sk_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during __selem_unlink_sk(). 
+ */ + selem = selem_alloc(smap, sk, value, !old_sdata); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } + + /* First, link the new selem to the map */ + selem_link_map(smap, selem); + + /* Second, link (and publish) the new selem to sk_storage */ + __selem_link_sk(sk_storage, selem); + + /* Third, remove old selem, SELEM(old_sdata) */ + if (old_sdata) { + selem_unlink_map(SELEM(old_sdata)); + __selem_unlink_sk(sk_storage, SELEM(old_sdata), false); + } + +unlock: + raw_spin_unlock_bh(&sk_storage->lock); + return SDATA(selem); + +unlock_err: + raw_spin_unlock_bh(&sk_storage->lock); + return ERR_PTR(err); +} + +static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +{ + struct bpf_sk_storage_data *sdata; + + sdata = sk_storage_lookup(sk, map, false); + if (!sdata) + return -ENOENT; + + selem_unlink(SELEM(sdata)); + + return 0; +} + +/* Called by __sk_destruct() */ +void bpf_sk_storage_free(struct sock *sk) +{ + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage *sk_storage; + bool free_sk_storage = false; + struct hlist_node *n; + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the sk_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * sk_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_sk_storage_map_free() alone + * when unlinking elem from the sk_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&sk_storage->lock); + hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { + /* Always unlink from map before unlinking from + * sk_storage. + */ + selem_unlink_map(selem); + free_sk_storage = __selem_unlink_sk(sk_storage, selem, true); + } + raw_spin_unlock_bh(&sk_storage->lock); + rcu_read_unlock(); + + if (free_sk_storage) + kfree_rcu(sk_storage, rcu); +} + +static void bpf_sk_storage_map_free(struct bpf_map *map) +{ + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct bucket *b; + unsigned int i; + + smap = (struct bpf_sk_storage_map *)map; + + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the sk->sk_bpf_storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_sk_storage_elem, + map_node))) { + selem_unlink(selem); + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* bpf_sk_storage_free() may still need to access the map. + * e.g. bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, the bpf_sk_storage_free() still needs to access + * the smap->elem_size to do the uncharging in + * __selem_unlink_sk(). + * + * Hence, wait another rcu grace period for the + * bpf_sk_storage_free() to finish.
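+ * + * (Summary sketch of this function's ordering: the first + * synchronize_rcu() above guarantees no prog or syscall still uses + * the map, the bucket walk unlinks every remaining elem, and the + * second synchronize_rcu() below lets any in-flight + * bpf_sk_storage_free() finish with smap before it is freed.)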
+ */ + synchronize_rcu(); + + kvfree(smap->buckets); + kfree(map); +} + +static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) +{ + if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + attr->key_size != sizeof(int) || !attr->value_size || + /* Enforce BTF for userspace sk dumping */ + !attr->btf_key_type_id || !attr->btf_value_type_id) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || + /* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ + attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + return -E2BIG; + + return 0; +} + +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_sk_storage_map *smap; + unsigned int i; + u32 nbuckets; + u64 cost; + + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); + nbuckets = 1U << smap->bucket_log; + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, + GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + kfree(smap); + return ERR_PTR(-ENOMEM); + } + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size; + smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) % + BPF_SK_STORAGE_CACHE_SIZE; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + return &smap->map; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static int bpf_sk_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + u32 int_data; + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + + return 0; +} + +static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_sk_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = sk_storage_lookup(sock->sk, map, true); + sockfd_put(sock); + return sdata ? 
sdata->data : NULL; + } + + return ERR_PTR(err); +} + +static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_sk_storage_data *sdata; + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + sdata = sk_storage_update(sock->sk, map, value, map_flags); + sockfd_put(sock); + return PTR_ERR_OR_ZERO(sdata); + } + + return err; +} + +static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct socket *sock; + int fd, err; + + fd = *(int *)key; + sock = sockfd_lookup(fd, &err); + if (sock) { + err = sk_storage_delete(sock->sk, map); + sockfd_put(sock); + return err; + } + + return err; +} + +BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + struct bpf_sk_storage_data *sdata; + + if (flags > BPF_SK_STORAGE_GET_F_CREATE) + return (unsigned long)NULL; + + sdata = sk_storage_lookup(sk, map, true); + if (sdata) + return (unsigned long)sdata->data; + + if (flags == BPF_SK_STORAGE_GET_F_CREATE && + /* Cannot add new elem to a going away sk. + * Otherwise, the new elem may become a leak + * (and also other memory issues during map + * destruction). + */ + refcount_inc_not_zero(&sk->sk_refcnt)) { + sdata = sk_storage_update(sk, map, value, BPF_NOEXIST); + /* sk must be a fullsock (guaranteed by verifier), + * so sock_gen_put() is unnecessary. + */ + sock_put(sk); + return IS_ERR(sdata) ? + (unsigned long)NULL : (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) +{ + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + int err; + + err = sk_storage_delete(sk, map); + sock_put(sk); + return err; + } + + return -ENOENT; +} + +const struct bpf_map_ops sk_storage_map_ops = { + .map_alloc_check = bpf_sk_storage_map_alloc_check, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, + .map_update_elem = bpf_fd_sk_storage_update_elem, + .map_delete_elem = bpf_fd_sk_storage_delete_elem, + .map_check_btf = bpf_sk_storage_map_check_btf, +}; + +const struct bpf_func_proto bpf_sk_storage_get_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_SOCKET, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_SOCKET, +}; diff --git a/net/core/datagram.c b/net/core/datagram.c index b2651bb6d2a3..45a162ef5e02 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -61,6 +61,8 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> +#include "datagram.h" + /* * Is a socket 'connection oriented' ? 
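 * (In this helper's sense, SOCK_SEQPACKET and SOCK_STREAM sockets are * connection oriented; a SOCK_DGRAM socket is not, even when connected.)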
*/ @@ -165,7 +167,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err, + int *off, int *err, struct sk_buff **last) { bool peek_at_off = false; @@ -192,7 +194,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, return NULL; } } - *peeked = 1; refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); @@ -210,7 +211,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * @sk: socket * @flags: MSG\_ flags * @destructor: invoked under the receive lock on successful dequeue - * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned @@ -244,7 +244,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err, + int *off, int *err, struct sk_buff **last) { struct sk_buff_head *queue = &sk->sk_receive_queue; @@ -258,7 +258,6 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; - *peeked = 0; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -268,7 +267,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, - peeked, off, &error, last); + off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; @@ -279,7 +278,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, break; sk_busy_loop(sk, flags & MSG_DONTWAIT); - } while (!skb_queue_empty(&sk->sk_receive_queue)); + } while (sk->sk_receive_queue.prev != *last); error = -EAGAIN; @@ -292,7 +291,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err) + int *off, int *err) { struct sk_buff *skb, *last; long timeo; @@ -300,8 +299,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, destructor, peeked, - off, err, &last); + skb = __skb_try_recv_datagram(sk, flags, destructor, off, err, + &last); if (skb) return skb; @@ -317,10 +316,10 @@ EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int noblock, int *err) { - int peeked, off = 0; + int off = 0; return __skb_recv_datagram(sk, flags | (noblock ? 
MSG_DONTWAIT : 0), - NULL, &peeked, &off, err); + NULL, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); @@ -408,10 +407,10 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); -int __skb_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, bool fault_short, - size_t (*cb)(const void *, size_t, void *, struct iov_iter *), - void *data) +static int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, + struct iov_iter *), void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; diff --git a/net/core/datagram.h b/net/core/datagram.h new file mode 100644 index 000000000000..bcfb75bfa3b2 --- /dev/null +++ b/net/core/datagram.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NET_CORE_DATAGRAM_H_ +#define _NET_CORE_DATAGRAM_H_ + +#include <linux/types.h> + +struct sock; +struct sk_buff; +struct iov_iter; + +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +#endif /* _NET_CORE_DATAGRAM_H_ */ diff --git a/net/core/dev.c b/net/core/dev.c index 2b67f2aa59dd..108ac8137b9b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -131,7 +131,6 @@ #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> -#include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> @@ -146,6 +145,7 @@ #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> +#include <net/devlink.h> #include "net-sysfs.h" @@ -1184,7 +1184,21 @@ int dev_change_name(struct net_device *dev, const char *newname) BUG_ON(!dev_net(dev)); net = dev_net(dev); - if (dev->flags & IFF_UP) + + /* Some auto-enslaved devices e.g. failover slaves are + * special, as userspace might rename the device after + * the interface had been brought up and running since + * the point kernel initiated auto-enslavement. Allow + * live name change even when these slave devices are + * up and running. + * + * Typically, users of these auto-enslaving devices + * don't actually care about slave name change, as + * they are supposed to operate on master interface + * directly. 
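+ * + * (Hypothetical example, not from this patch: udev renaming the + * auto-created net_failover slave of a virtio_net/VF pair after the + * kernel has already enslaved and opened it; the user only ever + * operates on the failover master.)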
+ */ + if (dev->flags & IFF_UP && + likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) return -EBUSY; write_seqcount_begin(&devnet_rename_seq); @@ -3468,6 +3482,15 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && + qdisc_run_begin(q)) { + qdisc_bstats_cpu_update(q, skb); + + if (sch_direct_xmit(skb, q, dev, txq, NULL, true)) + __qdisc_run(q); + + qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; qdisc_run(q); @@ -3556,9 +3579,6 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -DEFINE_PER_CPU(int, xmit_recursion); -EXPORT_SYMBOL(xmit_recursion); - /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in @@ -3689,23 +3709,21 @@ get_cpus_map: } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) + struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) + struct net_device *sb_dev) { return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; } EXPORT_SYMBOL(dev_pick_tx_cpu_id); -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) +u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -3729,10 +3747,11 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, return queue_index; } +EXPORT_SYMBOL(netdev_pick_tx); -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb, - struct net_device *sb_dev) +struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, + struct sk_buff *skb, + struct net_device *sb_dev) { int queue_index = 0; @@ -3747,10 +3766,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, sb_dev, - __netdev_pick_tx); + queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else - queue_index = __netdev_pick_tx(dev, skb, sb_dev); + queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } @@ -3824,7 +3842,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) else skb_dst_force(skb); - txq = netdev_pick_tx(dev, skb, sb_dev); + txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -3849,8 +3867,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) int cpu = smp_processor_id(); /* ok because BHs are off */ if (txq->xmit_lock_owner != cpu) { - if (unlikely(__this_cpu_read(xmit_recursion) > - XMIT_RECURSION_LIMIT)) + if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); @@ -3860,9 +3877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { - __this_cpu_inc(xmit_recursion); + dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); - __this_cpu_dec(xmit_recursion); + dev_xmit_recursion_dec(); if 
(dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -3975,9 +3992,9 @@ EXPORT_SYMBOL(rps_sock_flow_table); u32 rps_cpu_mask __read_mostly; EXPORT_SYMBOL(rps_cpu_mask); -struct static_key rps_needed __read_mostly; +struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); -struct static_key rfs_needed __read_mostly; +struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * @@ -4429,7 +4446,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) bool free_skb = true; int cpu, rc; - txq = netdev_pick_tx(dev, skb, NULL); + txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -4503,7 +4520,7 @@ static int netif_rx_internal(struct sk_buff *skb) } #ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { + if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -4970,7 +4987,8 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); if (pt_prev) - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, + skb->dev, pt_prev, orig_dev); return ret; } @@ -5014,8 +5032,11 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head, if (pt_prev->list_func != NULL) pt_prev->list_func(head, pt_prev, orig_dev); else - list_for_each_entry_safe(skb, next, head, list) - pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, + skb->dev, pt_prev, orig_dev); + } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) @@ -5172,7 +5193,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); #ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { + if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -5220,7 +5241,7 @@ static void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); #ifdef CONFIG_RPS - if (static_key_false(&rps_needed)) { + if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -7870,10 +7891,14 @@ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; + int err; - if (!ops->ndo_get_phys_port_name) - return -EOPNOTSUPP; - return ops->ndo_get_phys_port_name(dev, name, len); + if (ops->ndo_get_phys_port_name) { + err = ops->ndo_get_phys_port_name(dev, name, len); + if (err != -EOPNOTSUPP) + return err; + } + return devlink_compat_phys_port_name_get(dev, name, len); } EXPORT_SYMBOL(dev_get_phys_port_name); @@ -7893,14 +7918,21 @@ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; - int err = -EOPNOTSUPP; + int err; - if (ops->ndo_get_port_parent_id) - return ops->ndo_get_port_parent_id(dev, ppid); + if (ops->ndo_get_port_parent_id) { + err = ops->ndo_get_port_parent_id(dev, ppid); + if (err != -EOPNOTSUPP) + return err; + } - if (!recurse) + err = devlink_compat_switch_id_get(dev, ppid); + if (!err || err != -EOPNOTSUPP) return 
err; + if (!recurse) + return -EOPNOTSUPP; + netdev_for_each_lower_dev(dev, lower_dev, iter) { err = dev_get_port_parent_id(lower_dev, ppid, recurse); if (err) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 31380fd5a4e2..5163d900bb4f 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -366,7 +366,8 @@ EXPORT_SYMBOL(dev_load); * dev_ioctl - network device ioctl * @net: the applicable net namespace * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space + * @ifr: pointer to a struct ifreq in user space + * @need_copyout: whether or not copy_to_user() should be called * * Issue ioctl functions to devices. This is normally called by the * user space syscall interfaces but can sometimes be useful for diff --git a/net/core/devlink.c b/net/core/devlink.c index da0a29f30885..d43bc52b8840 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -19,6 +19,8 @@ #include <linux/device.h> #include <linux/list.h> #include <linux/netdevice.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> #include <rdma/ib_verbs.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -543,12 +545,14 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; + + spin_lock(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) - goto nla_put_failure; + goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) - goto nla_put_failure; + goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { struct net_device *netdev = devlink_port->type_dev; @@ -557,7 +561,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, netdev->ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, netdev->name))) - goto nla_put_failure; + goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_dev; @@ -565,14 +569,17 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) - goto nla_put_failure; + goto nla_put_failure_type_locked; } + spin_unlock(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; +nla_put_failure_type_locked: + spin_unlock(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -1041,14 +1048,15 @@ out: static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type) + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink->ops; if (ops->sb_pool_set) return ops->sb_pool_set(devlink, sb_index, pool_index, - size, threshold_type); + size, threshold_type, extack); return -EOPNOTSUPP; } @@ -1076,7 +1084,8 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); return devlink_sb_pool_set(devlink, devlink_sb->index, - pool_index, size, threshold_type); + pool_index, size, threshold_type, + info->extack); } static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, @@ -1237,14 +1246,15 @@ out: static int 
devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, - u32 threshold) + u32 threshold, + struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; if (ops->sb_port_pool_set) return ops->sb_port_pool_set(devlink_port, sb_index, - pool_index, threshold); + pool_index, threshold, extack); return -EOPNOTSUPP; } @@ -1267,7 +1277,7 @@ static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, - pool_index, threshold); + pool_index, threshold, info->extack); } static int @@ -1466,7 +1476,8 @@ out: static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold) + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; @@ -1474,7 +1485,7 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, if (ops->sb_tc_pool_bind_set) return ops->sb_tc_pool_bind_set(devlink_port, sb_index, tc_index, pool_type, - pool_index, threshold); + pool_index, threshold, extack); return -EOPNOTSUPP; } @@ -1509,7 +1520,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, tc_index, pool_type, - pool_index, threshold); + pool_index, threshold, info->extack); } static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, @@ -1661,7 +1672,7 @@ int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_field *field = &header->fields[match->field_id]; struct nlattr *match_attr; - match_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_MATCH); + match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH); if (!match_attr) return -EMSGSIZE; @@ -1686,7 +1697,8 @@ static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, { struct nlattr *matches_attr; - matches_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES); + matches_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_TABLE_MATCHES); if (!matches_attr) return -EMSGSIZE; @@ -1708,7 +1720,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb, struct devlink_dpipe_field *field = &header->fields[action->field_id]; struct nlattr *action_attr; - action_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ACTION); + action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION); if (!action_attr) return -EMSGSIZE; @@ -1733,7 +1745,8 @@ static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, { struct nlattr *actions_attr; - actions_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); + actions_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); if (!actions_attr) return -EMSGSIZE; @@ -1755,7 +1768,7 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, u64 table_size; table_size = table->table_ops->size_get(table->priv); - table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE); + table_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE); if (!table_attr) return -EMSGSIZE; @@ -1835,7 +1848,7 @@ start_again: if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; - tables_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLES); + tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES); if 
(!tables_attr) goto nla_put_failure; @@ -1936,8 +1949,8 @@ static int devlink_dpipe_action_values_put(struct sk_buff *skb, int err; for (i = 0; i < values_count; i++) { - action_attr = nla_nest_start(skb, - DEVLINK_ATTR_DPIPE_ACTION_VALUE); + action_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_ACTION_VALUE); if (!action_attr) return -EMSGSIZE; err = devlink_dpipe_action_value_put(skb, &values[i]); @@ -1973,8 +1986,8 @@ static int devlink_dpipe_match_values_put(struct sk_buff *skb, int err; for (i = 0; i < values_count; i++) { - match_attr = nla_nest_start(skb, - DEVLINK_ATTR_DPIPE_MATCH_VALUE); + match_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_MATCH_VALUE); if (!match_attr) return -EMSGSIZE; err = devlink_dpipe_match_value_put(skb, &values[i]); @@ -1995,7 +2008,7 @@ static int devlink_dpipe_entry_put(struct sk_buff *skb, struct nlattr *entry_attr, *matches_attr, *actions_attr; int err; - entry_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ENTRY); + entry_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY); if (!entry_attr) return -EMSGSIZE; @@ -2007,8 +2020,8 @@ static int devlink_dpipe_entry_put(struct sk_buff *skb, entry->counter, DEVLINK_ATTR_PAD)) goto nla_put_failure; - matches_attr = nla_nest_start(skb, - DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); + matches_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); if (!matches_attr) goto nla_put_failure; @@ -2020,8 +2033,8 @@ static int devlink_dpipe_entry_put(struct sk_buff *skb, } nla_nest_end(skb, matches_attr); - actions_attr = nla_nest_start(skb, - DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); + actions_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); if (!actions_attr) goto nla_put_failure; @@ -2078,8 +2091,8 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) devlink = dump_ctx->info->user_ptr[0]; if (devlink_nl_put_handle(dump_ctx->skb, devlink)) goto nla_put_failure; - dump_ctx->nest = nla_nest_start(dump_ctx->skb, - DEVLINK_ATTR_DPIPE_ENTRIES); + dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb, + DEVLINK_ATTR_DPIPE_ENTRIES); if (!dump_ctx->nest) goto nla_put_failure; return 0; @@ -2189,7 +2202,8 @@ static int devlink_dpipe_fields_put(struct sk_buff *skb, for (i = 0; i < header->fields_count; i++) { field = &header->fields[i]; - field_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_FIELD); + field_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_FIELD); if (!field_attr) return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || @@ -2212,7 +2226,7 @@ static int devlink_dpipe_header_put(struct sk_buff *skb, struct nlattr *fields_attr, *header_attr; int err; - header_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER); + header_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER); if (!header_attr) return -EMSGSIZE; @@ -2221,7 +2235,8 @@ static int devlink_dpipe_header_put(struct sk_buff *skb, nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) goto nla_put_failure; - fields_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER_FIELDS); + fields_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_HEADER_FIELDS); if (!fields_attr) goto nla_put_failure; @@ -2268,7 +2283,7 @@ start_again: if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; - headers_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADERS); + headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS); if (!headers_attr) goto nla_put_failure; @@ -2492,7 +2507,7 @@ static int 
devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, struct nlattr *child_resource_attr; struct nlattr *resource_attr; - resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE); + resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); if (!resource_attr) return -EMSGSIZE; @@ -2516,7 +2531,8 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, resource->size_valid)) goto nla_put_failure; - child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + child_resource_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_RESOURCE_LIST); if (!child_resource_attr) goto nla_put_failure; @@ -2567,7 +2583,8 @@ start_again: if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; - resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + resources_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_RESOURCE_LIST); if (!resources_attr) goto nla_put_failure; @@ -2821,7 +2838,8 @@ devlink_nl_param_value_fill_one(struct sk_buff *msg, { struct nlattr *param_value_attr; - param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE); + param_value_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_PARAM_VALUE); if (!param_value_attr) goto nla_put_failure; @@ -2912,7 +2930,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) goto genlmsg_cancel; - param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM); + param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); if (!param_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) @@ -2926,7 +2944,8 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) goto param_nest_cancel; - param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); + param_values_list = nla_nest_start_noflag(msg, + DEVLINK_ATTR_PARAM_VALUES_LIST); if (!param_values_list) goto param_nest_cancel; @@ -3326,7 +3345,7 @@ static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct nlattr *snap_attr; int err; - snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT); + snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); if (!snap_attr) return -EINVAL; @@ -3350,7 +3369,8 @@ static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, struct nlattr *snapshots_attr; int err; - snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); + snapshots_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_REGION_SNAPSHOTS); if (!snapshots_attr) return -EINVAL; @@ -3566,7 +3586,7 @@ static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, struct nlattr *chunk_attr; int err; - chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK); + chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); if (!chunk_attr) return -EINVAL; @@ -3640,7 +3660,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { u64 ret_offset, start_offset, end_offset = 0; - const struct genl_ops *ops = cb->data; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; @@ -3656,8 +3675,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, if (!attrs) return -ENOMEM; - err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack); + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, 
DEVLINK_ATTR_MAX, + devlink_nl_family.policy, cb->extack); if (err) goto out_free; @@ -3699,7 +3720,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, if (err) goto nla_put_failure; - chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS); + chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); if (!chunks_attr) { err = -EMSGSIZE; goto nla_put_failure; @@ -3775,7 +3796,7 @@ static int devlink_info_version_put(struct devlink_info_req *req, int attr, struct nlattr *nest; int err; - nest = nla_nest_start(req->msg, attr); + nest = nla_nest_start_noflag(req->msg, attr); if (!nest) return -EMSGSIZE; @@ -4303,7 +4324,7 @@ devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, int i = 0; int err; - fmsg_nlattr = nla_nest_start(skb, DEVLINK_ATTR_FMSG); + fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); if (!fmsg_nlattr) return -EMSGSIZE; @@ -4412,6 +4433,7 @@ struct devlink_health_reporter { u64 error_count; u64 recovery_count; u64 last_recovery_ts; + refcount_t refcount; }; void * @@ -4427,6 +4449,7 @@ devlink_health_reporter_find_by_name(struct devlink *devlink, { struct devlink_health_reporter *reporter; + lockdep_assert_held(&devlink->reporters_lock); list_for_each_entry(reporter, &devlink->reporter_list, list) if (!strcmp(reporter->ops->name, reporter_name)) return reporter; @@ -4450,7 +4473,7 @@ devlink_health_reporter_create(struct devlink *devlink, { struct devlink_health_reporter *reporter; - mutex_lock(&devlink->lock); + mutex_lock(&devlink->reporters_lock); if (devlink_health_reporter_find_by_name(devlink, ops->name)) { reporter = ERR_PTR(-EEXIST); goto unlock; @@ -4474,9 +4497,10 @@ devlink_health_reporter_create(struct devlink *devlink, reporter->graceful_period = graceful_period; reporter->auto_recover = auto_recover; mutex_init(&reporter->dump_lock); + refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); unlock: - mutex_unlock(&devlink->lock); + mutex_unlock(&devlink->reporters_lock); return reporter; } EXPORT_SYMBOL_GPL(devlink_health_reporter_create); @@ -4489,9 +4513,12 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_create); void devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) { - mutex_lock(&reporter->devlink->lock); + mutex_lock(&reporter->devlink->reporters_lock); list_del(&reporter->list); - mutex_unlock(&reporter->devlink->lock); + mutex_unlock(&reporter->devlink->reporters_lock); + while (refcount_read(&reporter->refcount) > 1) + msleep(100); + mutex_destroy(&reporter->dump_lock); if (reporter->dump_fmsg) devlink_fmsg_free(reporter->dump_fmsg); kfree(reporter); @@ -4627,6 +4654,7 @@ static struct devlink_health_reporter * devlink_health_reporter_get_from_info(struct devlink *devlink, struct genl_info *info) { + struct devlink_health_reporter *reporter; char *reporter_name; if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) @@ -4634,7 +4662,18 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, reporter_name = nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - return devlink_health_reporter_find_by_name(devlink, reporter_name); + mutex_lock(&devlink->reporters_lock); + reporter = devlink_health_reporter_find_by_name(devlink, reporter_name); + if (reporter) + refcount_inc(&reporter->refcount); + mutex_unlock(&devlink->reporters_lock); + return reporter; +} + +static void +devlink_health_reporter_put(struct devlink_health_reporter *reporter) +{ + refcount_dec(&reporter->refcount); } static int @@ -4654,7 +4693,8 @@ 
devlink_nl_health_reporter_fill(struct sk_buff *msg, if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; - reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER); + reporter_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_HEALTH_REPORTER); if (!reporter_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, @@ -4708,8 +4748,10 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; + if (!msg) { + err = -ENOMEM; + goto out; + } err = devlink_nl_health_reporter_fill(msg, devlink, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, @@ -4717,10 +4759,13 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, 0); if (err) { nlmsg_free(msg); - return err; + goto out; } - return genlmsg_reply(msg, info); + err = genlmsg_reply(msg, info); +out: + devlink_health_reporter_put(reporter); + return err; } static int @@ -4737,7 +4782,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; - mutex_lock(&devlink->lock); + mutex_lock(&devlink->reporters_lock); list_for_each_entry(reporter, &devlink->reporter_list, list) { if (idx < start) { @@ -4751,12 +4796,12 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->lock); + mutex_unlock(&devlink->reporters_lock); goto out; } idx++; } - mutex_unlock(&devlink->lock); + mutex_unlock(&devlink->reporters_lock); } out: mutex_unlock(&devlink_mutex); @@ -4771,6 +4816,7 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; + int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) @@ -4778,8 +4824,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) - return -EOPNOTSUPP; + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) { + err = -EOPNOTSUPP; + goto out; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -4789,7 +4837,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + devlink_health_reporter_put(reporter); return 0; +out: + devlink_health_reporter_put(reporter); + return err; } static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, @@ -4797,12 +4849,16 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; + int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; - return devlink_health_reporter_recover(reporter, NULL); + err = devlink_health_reporter_recover(reporter, NULL); + + devlink_health_reporter_put(reporter); + return err; } static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, @@ -4817,12 +4873,16 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->diagnose) + if (!reporter->ops->diagnose) { + devlink_health_reporter_put(reporter); return 
-EOPNOTSUPP; + } fmsg = devlink_fmsg_alloc(); - if (!fmsg) + if (!fmsg) { + devlink_health_reporter_put(reporter); return -ENOMEM; + } err = devlink_fmsg_obj_nest_start(fmsg); if (err) @@ -4841,6 +4901,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, out: devlink_fmsg_free(fmsg); + devlink_health_reporter_put(reporter); return err; } @@ -4855,8 +4916,10 @@ static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->dump) + if (!reporter->ops->dump) { + devlink_health_reporter_put(reporter); return -EOPNOTSUPP; + } mutex_lock(&reporter->dump_lock); err = devlink_health_do_dump(reporter, NULL); @@ -4868,6 +4931,7 @@ static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, out: mutex_unlock(&reporter->dump_lock); + devlink_health_reporter_put(reporter); return err; } @@ -4882,12 +4946,15 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - if (!reporter->ops->dump) + if (!reporter->ops->dump) { + devlink_health_reporter_put(reporter); return -EOPNOTSUPP; + } mutex_lock(&reporter->dump_lock); devlink_health_dump_clear(reporter); mutex_unlock(&reporter->dump_lock); + devlink_health_reporter_put(reporter); return 0; } @@ -4926,293 +4993,297 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { static const struct genl_ops devlink_nl_ops[] = { { .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_get_doit, .dumpit = devlink_nl_cmd_get_dumpit, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_PORT_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_get_doit, .dumpit = devlink_nl_cmd_port_get_dumpit, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_PORT_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_set_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_PORT_SPLIT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_unsplit_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, .dumpit = devlink_nl_cmd_sb_get_dumpit, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_get_doit, .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_SET, + .validate = 
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_set_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_get_doit, .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_port_pool_set_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | DEVLINK_NL_FLAG_NEED_SB, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_occ_snapshot_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_occ_max_clear_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_set_doit, - .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_table_get, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_entries_get, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_dpipe_headers_get, - .policy = devlink_nl_policy, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = 
	{
		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_resource_set,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
	},
	{
		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_resource_dump,
-		.policy = devlink_nl_policy,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
		/* can be retrieved by unprivileged users */
	},
	{
		.cmd = DEVLINK_CMD_RELOAD,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_reload,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
				  DEVLINK_NL_FLAG_NO_LOCK,
	},
	{
		.cmd = DEVLINK_CMD_PARAM_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_param_get_doit,
		.dumpit = devlink_nl_cmd_param_get_dumpit,
-		.policy = devlink_nl_policy,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
		/* can be retrieved by unprivileged users */
	},
	{
		.cmd = DEVLINK_CMD_PARAM_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_param_set_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
	},
	{
		.cmd = DEVLINK_CMD_PORT_PARAM_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_port_param_get_doit,
		.dumpit = devlink_nl_cmd_port_param_get_dumpit,
-		.policy = devlink_nl_policy,
		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
		/* can be retrieved by unprivileged users */
	},
	{
		.cmd = DEVLINK_CMD_PORT_PARAM_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_port_param_set_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
	},
	{
		.cmd = DEVLINK_CMD_REGION_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_region_get_doit,
		.dumpit = devlink_nl_cmd_region_get_dumpit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
	},
	{
		.cmd = DEVLINK_CMD_REGION_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_region_del,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
	},
	{
		.cmd = DEVLINK_CMD_REGION_READ,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.dumpit = devlink_nl_cmd_region_read_dumpit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
	},
	{
		.cmd = DEVLINK_CMD_INFO_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_info_get_doit,
		.dumpit = devlink_nl_cmd_info_get_dumpit,
-		.policy = devlink_nl_policy,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
		/* can be retrieved by unprivileged users */
	},
	{
		.cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_health_reporter_get_doit,
		.dumpit = devlink_nl_cmd_health_reporter_get_dumpit,
-		.policy = devlink_nl_policy,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
		/* can be retrieved by unprivileged users */
	},
	{
		.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_health_reporter_set_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
	},
	{
		.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_health_reporter_recover_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
	},
	{
		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
	},
	{
		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_health_reporter_dump_get_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
				  DEVLINK_NL_FLAG_NO_LOCK,
	},
	{
		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
				  DEVLINK_NL_FLAG_NO_LOCK,
	},
	{
		.cmd = DEVLINK_CMD_FLASH_UPDATE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = devlink_nl_cmd_flash_update,
-		.policy = devlink_nl_policy,
		.flags = GENL_ADMIN_PERM,
		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
	},
@@ -5222,6 +5293,7 @@ static struct genl_family devlink_nl_family __ro_after_init = {
	.name		= DEVLINK_GENL_NAME,
	.version	= DEVLINK_GENL_VERSION,
	.maxattr	= DEVLINK_ATTR_MAX,
+	.policy		= devlink_nl_policy,
	.netnsok	= true,
	.pre_doit	= devlink_nl_pre_doit,
	.post_doit	= devlink_nl_post_doit,
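To make the new shape concrete, here is a minimal, hypothetical sketch (not from this patch) of a generic netlink family after this conversion: the nla_policy hangs off struct genl_family rather than each op, and pre-existing commands opt out of strict validation per op. All FOO_* names and handlers are invented for illustration.

#include <net/genetlink.h>

enum { FOO_ATTR_UNSPEC, FOO_ATTR_NAME, __FOO_ATTR_MAX };
#define FOO_ATTR_MAX (__FOO_ATTR_MAX - 1)
enum { FOO_CMD_GET = 1 };

static const struct nla_policy foo_nl_policy[FOO_ATTR_MAX + 1] = {
	[FOO_ATTR_NAME] = { .type = NLA_NUL_STRING },
};

static int foo_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;
}

static const struct genl_ops foo_nl_ops[] = {
	{
		.cmd = FOO_CMD_GET,
		/* legacy command: keep old, permissive parsing */
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = foo_nl_cmd_get_doit,
		/* no per-op .policy; it now lives in the family */
	},
};

static struct genl_family foo_nl_family __ro_after_init = {
	.name = "foo",
	.version = 1,
	.maxattr = FOO_ATTR_MAX,
	.policy = foo_nl_policy,	/* family-wide policy */
	.module = THIS_MODULE,
	.ops = foo_nl_ops,
	.n_ops = ARRAY_SIZE(foo_nl_ops),
};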
@@ -5261,6 +5333,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
	INIT_LIST_HEAD(&devlink->region_list);
	INIT_LIST_HEAD(&devlink->reporter_list);
	mutex_init(&devlink->lock);
+	mutex_init(&devlink->reporters_lock);
	return devlink;
}
EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -5303,6 +5376,8 @@ EXPORT_SYMBOL_GPL(devlink_unregister);
 */
void devlink_free(struct devlink *devlink)
{
+	mutex_destroy(&devlink->reporters_lock);
+	mutex_destroy(&devlink->lock);
	WARN_ON(!list_empty(&devlink->reporter_list));
	WARN_ON(!list_empty(&devlink->region_list));
	WARN_ON(!list_empty(&devlink->param_list));
@@ -5340,6 +5415,7 @@ int devlink_port_register(struct devlink *devlink,
	devlink_port->devlink = devlink;
	devlink_port->index = port_index;
	devlink_port->registered = true;
+	spin_lock_init(&devlink_port->type_lock);
	list_add_tail(&devlink_port->list, &devlink->port_list);
	INIT_LIST_HEAD(&devlink_port->param_list);
	mutex_unlock(&devlink->lock);
@@ -5368,8 +5444,12 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
				    enum devlink_port_type type,
				    void *type_dev)
{
+	if (WARN_ON(!devlink_port->registered))
+		return;
+	spin_lock(&devlink_port->type_lock);
	devlink_port->type = type;
	devlink_port->type_dev = type_dev;
+	spin_unlock(&devlink_port->type_lock);
	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
@@ -5382,8 +5462,39 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
void devlink_port_type_eth_set(struct devlink_port *devlink_port,
			       struct net_device *netdev)
{
-	return __devlink_port_type_set(devlink_port,
-				       DEVLINK_PORT_TYPE_ETH, netdev);
+	const struct net_device_ops *ops = netdev->netdev_ops;
+
+	/* If driver registers devlink port, it should set devlink port
+	 * attributes accordingly so the compat functions are called
+	 * and the original ops are not used.
+	 */
+	if (ops->ndo_get_phys_port_name) {
+		/* Some drivers use the same set of ndos for netdevs
+		 * that have devlink_port registered and also for
+		 * those that don't. Make sure that ndo_get_phys_port_name
+		 * returns -EOPNOTSUPP here in case it is defined.
+		 * Warn if not.
+		 */
+		char name[IFNAMSIZ];
+		int err;
+
+		err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name));
+		WARN_ON(err != -EOPNOTSUPP);
+	}
+	if (ops->ndo_get_port_parent_id) {
+		/* Some drivers use the same set of ndos for netdevs
+		 * that have devlink_port registered and also for
+		 * those that don't. Make sure that ndo_get_port_parent_id
+		 * returns -EOPNOTSUPP here in case it is defined.
+		 * Warn if not.
+		 */
+		struct netdev_phys_item_id ppid;
+		int err;
+
+		err = ops->ndo_get_port_parent_id(netdev, &ppid);
+		WARN_ON(err != -EOPNOTSUPP);
+	}
+	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
@@ -5396,8 +5507,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
void devlink_port_type_ib_set(struct devlink_port *devlink_port,
			      struct ib_device *ibdev)
{
-	return __devlink_port_type_set(devlink_port,
-				       DEVLINK_PORT_TYPE_IB, ibdev);
+	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
@@ -5408,8 +5518,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
 */
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
-	return __devlink_port_type_set(devlink_port,
-				       DEVLINK_PORT_TYPE_NOTSET, NULL);
+	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
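The WARN_ON checks in devlink_port_type_eth_set() above assume a convention: a driver that shares one set of ndos bails out with -EOPNOTSUPP once a devlink port is registered, so the devlink compat helpers take over. A hedged, hypothetical driver-side sketch of that convention follows; the foo_* names are invented.

struct foo_priv {
	struct devlink_port devlink_port;
	bool devlink_port_registered;
	u32 port_id;
};

static int foo_ndo_get_phys_port_name(struct net_device *dev,
				      char *name, size_t len)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Defer to the devlink compat layer once a port is registered,
	 * matching the WARN_ON expectation in devlink_port_type_eth_set().
	 */
	if (priv->devlink_port_registered)
		return -EOPNOTSUPP;

	if (snprintf(name, len, "p%u", priv->port_id) >= (int)len)
		return -EINVAL;
	return 0;
}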
@@ -5423,25 +5532,40 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 * @split: indicates if this is split port
 * @split_subport_number: if the port is split, this is the number
 *                        of subport.
+ * @switch_id: if the port is part of a switch, this is buffer with ID,
+ *             otherwise this is NULL
+ * @switch_id_len: length of the switch_id buffer
 */
void devlink_port_attrs_set(struct devlink_port *devlink_port,
			    enum devlink_port_flavour flavour,
			    u32 port_number, bool split,
-			    u32 split_subport_number)
+			    u32 split_subport_number,
+			    const unsigned char *switch_id,
+			    unsigned char switch_id_len)
{
	struct devlink_port_attrs *attrs = &devlink_port->attrs;

+	if (WARN_ON(devlink_port->registered))
+		return;
	attrs->set = true;
	attrs->flavour = flavour;
	attrs->port_number = port_number;
	attrs->split = split;
	attrs->split_subport_number = split_subport_number;
-	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	if (switch_id) {
+		attrs->switch_port = true;
+		if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
+			switch_id_len = MAX_PHYS_ITEM_ID_LEN;
+		memcpy(attrs->switch_id.id, switch_id, switch_id_len);
+		attrs->switch_id.id_len = switch_id_len;
+	} else {
+		attrs->switch_port = false;
+	}
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);

-int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
-				    char *name, size_t len)
+static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
+					     char *name, size_t len)
{
	struct devlink_port_attrs *attrs = &devlink_port->attrs;
	int n = 0;
@@ -5471,7 +5595,6 @@ int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
	return 0;
}
-EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);

int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
			u32 size, u16 ingress_pools_count,
@@ -6447,17 +6570,15 @@ void devlink_compat_running_version(struct net_device *dev,
	dev_hold(dev);
	rtnl_unlock();

-	mutex_lock(&devlink_mutex);
	devlink = netdev_to_devlink(dev);
	if (!devlink || !devlink->ops->info_get)
-		goto unlock_list;
+		goto out;

	mutex_lock(&devlink->lock);
	__devlink_compat_running_version(devlink, buf, len);
	mutex_unlock(&devlink->lock);

-unlock_list:
-	mutex_unlock(&devlink_mutex);
+out:
	rtnl_lock();
	dev_put(dev);
}
@@ -6465,28 +6586,65 @@ unlock_list:
int devlink_compat_flash_update(struct net_device *dev, const char *file_name)
{
	struct devlink *devlink;
-	int ret = -EOPNOTSUPP;
+	int ret;

	dev_hold(dev);
	rtnl_unlock();

-	mutex_lock(&devlink_mutex);
	devlink = netdev_to_devlink(dev);
-	if (!devlink || !devlink->ops->flash_update)
-		goto unlock_list;
+	if (!devlink || !devlink->ops->flash_update) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}

	mutex_lock(&devlink->lock);
	ret = devlink->ops->flash_update(devlink, file_name, NULL, NULL);
	mutex_unlock(&devlink->lock);

-unlock_list:
-	mutex_unlock(&devlink_mutex);
+out:
	rtnl_lock();
	dev_put(dev);

	return ret;
}
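With the reworked devlink_port_attrs_set() above, a driver hands over the switch ID together with the other port attributes, and must do so before devlink_port_register() to satisfy the new WARN_ON ordering. A hypothetical driver sketch, with invented foo_* names and an arbitrary example ID:

static int foo_port_init(struct foo_port *port)
{
	static const unsigned char switch_id[] = { 0xaa, 0xbb, 0xcc, 0xdd };
	int err;

	/* Attributes, including the switch ID, go in first ... */
	devlink_port_attrs_set(&port->devlink_port,
			       DEVLINK_PORT_FLAVOUR_PHYSICAL,
			       port->index, false, 0,
			       switch_id, sizeof(switch_id));

	/* ... then the port is registered ... */
	err = devlink_port_register(port->devlink, &port->devlink_port,
				    port->index);
	if (err)
		return err;

	/* ... and only a registered port may get a type. */
	devlink_port_type_eth_set(&port->devlink_port, port->netdev);
	return 0;
}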
+int devlink_compat_phys_port_name_get(struct net_device *dev,
+				      char *name, size_t len)
+{
+	struct devlink_port *devlink_port;
+
+	/* RTNL mutex is held here which ensures that devlink_port
+	 * instance cannot disappear in the middle. No need to take
+	 * any devlink lock as only permanent values are accessed.
+	 */
+	ASSERT_RTNL();
+
+	devlink_port = netdev_to_devlink_port(dev);
+	if (!devlink_port)
+		return -EOPNOTSUPP;
+
+	return __devlink_port_phys_port_name_get(devlink_port, name, len);
+}
+
+int devlink_compat_switch_id_get(struct net_device *dev,
+				 struct netdev_phys_item_id *ppid)
+{
+	struct devlink_port *devlink_port;
+
+	/* RTNL mutex is held here which ensures that devlink_port
+	 * instance cannot disappear in the middle. No need to take
+	 * any devlink lock as only permanent values are accessed.
+	 */
+	ASSERT_RTNL();
+	devlink_port = netdev_to_devlink_port(dev);
+	if (!devlink_port || !devlink_port->attrs.switch_port)
+		return -EOPNOTSUPP;
+
+	memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid));
+
+	return 0;
+}
+
static int __init devlink_init(void)
{
	return genl_register_family(&devlink_nl_family);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index c7785efeea57..d4ce0542acfa 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -355,14 +355,17 @@ out:
static const struct genl_ops dropmon_ops[] = {
	{
		.cmd = NET_DM_CMD_CONFIG,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = net_dm_cmd_config,
	},
	{
		.cmd = NET_DM_CMD_START,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = net_dm_cmd_trace,
	},
	{
		.cmd = NET_DM_CMD_STOP,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = net_dm_cmd_trace,
	},
};
diff --git a/net/core/dst.c b/net/core/dst.c
index a263309df115..1f13d90cd0e4 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -26,23 +26,6 @@
#include <net/dst.h>
#include <net/dst_metadata.h>

-/*
- * Theory of operations:
- * 1) We use a list, protected by a spinlock, to add
- *    new entries from both BH and non-BH context.
- * 2) In order to keep spinlock held for a small delay,
- *    we use a second list where are stored long lived
- *    entries, that are handled by the garbage collect thread
- *    fired by a workqueue.
- * 3) This list is guarded by a mutex,
- *    so that the gc_task and dst_dev_event() can be synchronized.
- */
-
-/*
- * We want to keep lock & list close together
- * to dirty as few cache lines as possible in __dst_free().
- * As this is not a very strong hint, we dont force an alignment on SMP.
- */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b1eb32419732..4a593853cbf2 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -136,6 +136,7 @@ static const char
phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
	[ETHTOOL_ID_UNSPEC]     = "Unspec",
	[ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+	[ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down",
};

static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
@@ -1797,11 +1798,16 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
	WARN_ON_ONCE(!ret);

	gstrings.len = ret;
-	data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
-	if (gstrings.len && !data)
-		return -ENOMEM;

-	__ethtool_get_strings(dev, gstrings.string_set, data);
+	if (gstrings.len) {
+		data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
+		if (!data)
+			return -ENOMEM;
+
+		__ethtool_get_strings(dev, gstrings.string_set, data);
+	} else {
+		data = NULL;
+	}

	ret = -EFAULT;
	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
@@ -1897,11 +1903,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
		return -EFAULT;

	stats.n_stats = n_stats;
-	data = vzalloc(array_size(n_stats, sizeof(u64)));
-	if (n_stats && !data)
-		return -ENOMEM;

-	ops->get_ethtool_stats(dev, &stats, data);
+	if (n_stats) {
+		data = vzalloc(array_size(n_stats, sizeof(u64)));
+		if (!data)
+			return -ENOMEM;
+		ops->get_ethtool_stats(dev, &stats, data);
+	} else {
+		data = NULL;
+	}

	ret = -EFAULT;
	if (copy_to_user(useraddr, &stats, sizeof(stats)))
@@ -1941,16 +1951,21 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
		return -EFAULT;

	stats.n_stats = n_stats;
-	data = vzalloc(array_size(n_stats, sizeof(u64)));
-	if (n_stats && !data)
-		return -ENOMEM;

-	if (dev->phydev && !ops->get_ethtool_phy_stats) {
-		ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
-		if (ret < 0)
-			return ret;
+	if (n_stats) {
+		data = vzalloc(array_size(n_stats, sizeof(u64)));
+		if (!data)
+			return -ENOMEM;
+
+		if (dev->phydev && !ops->get_ethtool_phy_stats) {
+			ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+			if (ret < 0)
+				goto out;
+		} else {
+			ops->get_ethtool_phy_stats(dev, &stats, data);
+		}
	} else {
-		ops->get_ethtool_phy_stats(dev, &stats, data);
+		data = NULL;
	}

	ret = -EFAULT;
@@ -2432,6 +2447,7 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
{
	switch (tuna->id) {
	case ETHTOOL_PHY_DOWNSHIFT:
+	case ETHTOOL_PHY_FAST_LINK_DOWN:
		if (tuna->len != sizeof(u8) ||
		    tuna->type_id != ETHTOOL_TUNABLE_U8)
			return -EINVAL;
diff --git a/net/core/failover.c b/net/core/failover.c
index 4a92a98ccce9..b5cd3c727285 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev)
		goto err_upper_link;
	}

-	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+	slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);

	if (fops && fops->slave_register &&
	    !fops->slave_register(slave_dev, failover_dev))
		return NOTIFY_OK;

	netdev_upper_dev_unlink(slave_dev, failover_dev);
-	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+	slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);
err_upper_link:
	netdev_rx_handler_unregister(slave_dev);
done:
@@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev)

	netdev_rx_handler_unregister(slave_dev);
	netdev_upper_dev_unlink(slave_dev, failover_dev);
-	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+	slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK);

	if (fops && fops->slave_unregister &&
	    !fops->slave_unregister(slave_dev, failover_dev))
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ffbb827723a2..43f0115cce9c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -746,7 +746,8 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
		goto errout;
	}

-	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+	err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
+				     ops->policy, extack);
	if (err < 0) {
		NL_SET_ERR_MSG(extack, "Error parsing msg");
		goto errout;
@@ -756,9 +757,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
	if (err)
		goto errout;

-	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
-	    rule_exists(ops, frh, tb, rule)) {
-		err = -EEXIST;
+	if (rule_exists(ops, frh, tb, rule)) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			err = -EEXIST;
		goto errout_free;
	}

@@ -853,7 +854,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
		goto errout;
	}

-	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+	err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
+				     ops->policy, extack);
	if (err < 0) {
		NL_SET_ERR_MSG(extack, "Error parsing msg");
		goto errout;
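Read in isolation, the fib_nl_newrule() change above means a duplicate add is now always caught, but only reported as an error when the caller set NLM_F_EXCL; otherwise the request returns success without installing a second copy. A trivial standalone restatement of that decision, with invented names:

#include <linux/errno.h>
#include <linux/netlink.h>

/* Invented helper mirroring the semantics of the hunk above. */
static int foo_newrule_outcome(bool duplicate, unsigned int nlmsg_flags)
{
	if (duplicate) {
		if (nlmsg_flags & NLM_F_EXCL)
			return -EEXIST;	/* exclusive add: explicit failure */
		return 0;		/* idempotent success, no duplicate */
	}
	return 1;			/* proceed with insertion */
}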
diff --git a/net/core/filter.c b/net/core/filter.c
index 647c63a7b25b..55bfc941d17a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -74,6 +74,8 @@
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
+#include <net/ipv6_stubs.h>
+#include <net/bpf_sk_storage.h>

/**
 *	sk_filter_trim_cap - run a packet through a socket filter
@@ -1729,6 +1731,40 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.arg4_type	= ARG_CONST_SIZE,
};

+BPF_CALL_4(bpf_flow_dissector_load_bytes,
+	   const struct bpf_flow_dissector *, ctx, u32, offset,
+	   void *, to, u32, len)
+{
+	void *ptr;
+
+	if (unlikely(offset > 0xffff))
+		goto err_clear;
+
+	if (unlikely(!ctx->skb))
+		goto err_clear;
+
+	ptr = skb_header_pointer(ctx->skb, offset, len, to);
+	if (unlikely(!ptr))
+		goto err_clear;
+	if (ptr != to)
+		memcpy(to, ptr, len);
+
+	return 0;
+err_clear:
+	memset(to, 0, len);
+	return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
+	.func		= bpf_flow_dissector_load_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
	   u32, offset, void *, to, u32, len, u32, start_header)
{
@@ -2015,7 +2051,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret;

-	if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
+	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
@@ -2023,9 +2059,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)

	skb->dev = dev;

-	__this_cpu_inc(xmit_recursion);
+	dev_xmit_recursion_inc();
	ret = dev_queue_xmit(skb);
-	__this_cpu_dec(xmit_recursion);
+	dev_xmit_recursion_dec();

	return ret;
}
@@ -2963,42 +2999,128 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
	}
}

-static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
+#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK	(BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
+					 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+
+#define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
+					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
+					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
+					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+					 BPF_F_ADJ_ROOM_ENCAP_L2( \
+						BPF_ADJ_ROOM_ENCAP_L2_MASK))
+
+static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
+			    u64 flags)
{
-	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
+	u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
+	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
+	u16 mac_len = 0, inner_net = 0, inner_trans = 0;
+	unsigned int gso_type = SKB_GSO_DODGY;
	int ret;

-	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-		return -ENOTSUPP;
+	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
+		/* udp gso_size delineates datagrams, only allow if fixed */
+		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
+		    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+			return -ENOTSUPP;
+	}

-	ret = skb_cow(skb, len_diff);
+	ret = skb_cow_head(skb, len_diff);
	if (unlikely(ret < 0))
		return ret;

+	if (encap) {
+		if (skb->protocol != htons(ETH_P_IP) &&
+		    skb->protocol != htons(ETH_P_IPV6))
+			return -ENOTSUPP;
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+			return -EINVAL;
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+			return -EINVAL;
+
+		if (skb->encapsulation)
+			return -EALREADY;
+
+		mac_len = skb->network_header - skb->mac_header;
+		inner_net = skb->network_header;
+		if (inner_mac_len > len_diff)
+			return -EINVAL;
+		inner_trans = skb->transport_header;
+	}
+
	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

+	if (encap) {
+		skb->inner_mac_header = inner_net - inner_mac_len;
+		skb->inner_network_header = inner_net;
+		skb->inner_transport_header = inner_trans;
+		skb_set_inner_protocol(skb, skb->protocol);
+
+		skb->encapsulation = 1;
+		skb_set_network_header(skb, mac_len);
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+			gso_type |= SKB_GSO_UDP_TUNNEL;
+		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
+			gso_type |= SKB_GSO_GRE;
+		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+			gso_type |= SKB_GSO_IPXIP6;
+		else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
+			gso_type |= SKB_GSO_IPXIP4;
+
+		if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
+			int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
+					sizeof(struct ipv6hdr) :
+					sizeof(struct iphdr);
+
+			skb_set_transport_header(skb, mac_len + nh_len);
+		}
+
+		/* Match skb->protocol to new outer l3 protocol */
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+			skb->protocol = htons(ETH_P_IPV6);
+		else if (skb->protocol == htons(ETH_P_IPV6) &&
+			 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
+			skb->protocol = htons(ETH_P_IP);
+	}
+
	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* Due to header grow, MSS needs to be downgraded. */
-		skb_decrease_gso_size(shinfo, len_diff);
+		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+			skb_decrease_gso_size(shinfo, len_diff);
+
		/* Header must be checked, and gso_segs recomputed. */
-		shinfo->gso_type |= SKB_GSO_DODGY;
+		shinfo->gso_type |= gso_type;
		shinfo->gso_segs = 0;
	}

	return 0;
}
-static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
+static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
+			      u64 flags)
{
-	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
	int ret;

-	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
-		return -ENOTSUPP;
+	if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO)
+		return -EINVAL;
+
+	if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
+		/* udp gso_size delineates datagrams, only allow if fixed */
+		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
+		    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+			return -ENOTSUPP;
+	}

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
@@ -3012,7 +3134,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* Due to header shrink, MSS can be upgraded. */
-		skb_increase_gso_size(shinfo, len_diff);
+		if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+			skb_increase_gso_size(shinfo, len_diff);
+
		/* Header must be checked, and gso_segs recomputed. */
		shinfo->gso_type |= SKB_GSO_DODGY;
		shinfo->gso_segs = 0;
@@ -3027,49 +3151,50 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
			  SKB_MAX_ALLOC;
}

-static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
+BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+	   u32, mode, u64, flags)
{
-	bool trans_same = skb->transport_header == skb->network_header;
	u32 len_cur, len_diff_abs = abs(len_diff);
	u32 len_min = bpf_skb_net_base_len(skb);
	u32 len_max = __bpf_skb_max_len(skb);
	__be16 proto = skb->protocol;
	bool shrink = len_diff < 0;
+	u32 off;
	int ret;

+	if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
+		return -EINVAL;
	if (unlikely(len_diff_abs > 0xfffU))
		return -EFAULT;
	if (unlikely(proto != htons(ETH_P_IP) &&
		     proto != htons(ETH_P_IPV6)))
		return -ENOTSUPP;

+	off = skb_mac_header_len(skb);
+	switch (mode) {
+	case BPF_ADJ_ROOM_NET:
+		off += bpf_skb_net_base_len(skb);
+		break;
+	case BPF_ADJ_ROOM_MAC:
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
	len_cur = skb->len - skb_network_offset(skb);
-	if (skb_transport_header_was_set(skb) && !trans_same)
-		len_cur = skb_network_header_len(skb);
	if ((shrink && (len_diff_abs >= len_cur ||
			len_cur - len_diff_abs < len_min)) ||
	    (!shrink && (skb->len + len_diff_abs > len_max &&
			 !skb_is_gso(skb))))
		return -ENOTSUPP;

-	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
-		       bpf_skb_net_grow(skb, len_diff_abs);
+	ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
+		       bpf_skb_net_grow(skb, off, len_diff_abs, flags);

	bpf_compute_data_pointers(skb);
	return ret;
}

-BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
-	   u32, mode, u64, flags)
-{
-	if (unlikely(flags))
-		return -EINVAL;
-	if (likely(mode == BPF_ADJ_ROOM_NET))
-		return bpf_skb_adjust_net(skb, len_diff);
-
-	return -ENOTSUPP;
-}
-
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
	.func		= bpf_skb_adjust_room,
	.gpl_only	= false,
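As a consumer-side illustration of the extended helper above, a hypothetical tc BPF program (restricted C) could reserve room for an outer IPv4 plus UDP header at the MAC layer while keeping the GSO size fixed. The header writes and error handling are elided; include paths and section naming follow libbpf-era conventions and may differ by toolchain.

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

SEC("classifier")
int encap_ipv4_udp(struct __sk_buff *skb)
{
	__u32 room = sizeof(struct iphdr) + sizeof(struct udphdr);

	/* Grow at the MAC layer and tag the skb as IPv4-in-UDP encap;
	 * FIXED_GSO leaves gso_size untouched.
	 */
	if (bpf_skb_adjust_room(skb, room, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
				BPF_F_ADJ_ROOM_ENCAP_L4_UDP))
		return TC_ACT_SHOT;

	/* The outer iphdr/udphdr still have to be written, e.g. via
	 * bpf_skb_store_bytes(); omitted here.
	 */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";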
@@ -4355,8 +4480,7 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
	if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
		return -EINVAL;

-	if (val)
-		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;

	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
}
@@ -4383,6 +4507,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
	 * Only binding to IP is supported.
	 */
	err = -EINVAL;
+	if (addr_len < offsetofend(struct sockaddr, sa_family))
+		return err;
	if (addr->sa_family == AF_INET) {
		if (addr_len < sizeof(struct sockaddr_in))
			return err;
@@ -4480,11 +4606,11 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
			       u32 flags, bool check_mtu)
{
+	struct fib_nh_common *nhc;
	struct in_device *in_dev;
	struct neighbour *neigh;
	struct net_device *dev;
	struct fib_result res;
-	struct fib_nh *nh;
	struct flowi4 fl4;
	int err;
	u32 mtu;
@@ -4557,22 +4683,33 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
		return BPF_FIB_LKUP_RET_FRAG_NEEDED;
	}

-	nh = &res.fi->fib_nh[res.nh_sel];
+	nhc = res.nhc;

	/* do not handle lwt encaps right now */
-	if (nh->nh_lwtstate)
+	if (nhc->nhc_lwtstate)
		return BPF_FIB_LKUP_RET_UNSUPP_LWT;

-	dev = nh->nh_dev;
-	if (nh->nh_gw)
-		params->ipv4_dst = nh->nh_gw;
+	dev = nhc->nhc_dev;

	params->rt_metric = res.fi->fib_priority;

	/* xdp and cls_bpf programs are run in RCU-bh so
	 * rcu_read_lock_bh is not needed here
	 */
-	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (likely(nhc->nhc_gw_family != AF_INET6)) {
+		if (nhc->nhc_gw_family)
+			params->ipv4_dst = nhc->nhc_gw.ipv4;
+
+		neigh = __ipv4_neigh_lookup_noref(dev,
+						  (__force u32)params->ipv4_dst);
+	} else {
+		struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
+
+		params->family = AF_INET6;
+		*dst = nhc->nhc_gw.ipv6;
+		neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
+	}
+
	if (!neigh)
		return BPF_FIB_LKUP_RET_NO_NEIGH;
@@ -4586,13 +4723,13 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
{
	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+	struct fib6_result res = {};
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;
-	struct fib6_info *f6i;
	struct flowi6 fl6;
	int strict = 0;
-	int oif;
+	int oif, err;
	u32 mtu;

	/* link local addresses are never forwarded */
@@ -4634,61 +4771,57 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
		if (unlikely(!tb))
			return BPF_FIB_LKUP_RET_NOT_FWDED;

-		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+		err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
+						   strict);
	} else {
		fl6.flowi6_mark = 0;
		fl6.flowi6_secid = 0;
		fl6.flowi6_tun_key.tun_id = 0;
		fl6.flowi6_uid = sock_net_uid(net, NULL);

-		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+		err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
	}

-	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+	if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
+		     res.f6i == net->ipv6.fib6_null_entry))
		return BPF_FIB_LKUP_RET_NOT_FWDED;

-	if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
-		switch (f6i->fib6_type) {
-		case RTN_BLACKHOLE:
-			return BPF_FIB_LKUP_RET_BLACKHOLE;
-		case RTN_UNREACHABLE:
-			return BPF_FIB_LKUP_RET_UNREACHABLE;
-		case RTN_PROHIBIT:
-			return BPF_FIB_LKUP_RET_PROHIBIT;
-		default:
-			return BPF_FIB_LKUP_RET_NOT_FWDED;
-		}
-	}
-
-	if (f6i->fib6_type != RTN_UNICAST)
+	switch (res.fib6_type) {
+	/* only unicast is forwarded */
+	case RTN_UNICAST:
+		break;
+	case RTN_BLACKHOLE:
+		return BPF_FIB_LKUP_RET_BLACKHOLE;
+	case RTN_UNREACHABLE:
+		return BPF_FIB_LKUP_RET_UNREACHABLE;
+	case RTN_PROHIBIT:
+		return BPF_FIB_LKUP_RET_PROHIBIT;
+	default:
		return BPF_FIB_LKUP_RET_NOT_FWDED;
+	}

-	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
-		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
-						       fl6.flowi6_oif, NULL,
-						       strict);
+	ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
+				    fl6.flowi6_oif != 0, NULL, strict);

	if (check_mtu) {
-		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+		mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
		if (params->tot_len > mtu)
			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
	}

-	if (f6i->fib6_nh.nh_lwtstate)
+	if (res.nh->fib_nh_lws)
		return BPF_FIB_LKUP_RET_UNSUPP_LWT;

-	if (f6i->fib6_flags & RTF_GATEWAY)
-		*dst = f6i->fib6_nh.nh_gw;
+	if (res.nh->fib_nh_gw_family)
+		*dst = res.nh->fib_nh_gw6;

-	dev = f6i->fib6_nh.nh_dev;
-	params->rt_metric = f6i->fib6_metric;
+	dev = res.nh->fib_nh_dev;
+	params->rt_metric = res.f6i->fib6_metric;

	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
-	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
-	 * because we need to get nd_tbl via the stub
+	 * not needed here.
	 */
-	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
-				      ndisc_hashfn, dst, dev);
+	neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
	if (!neigh)
		return BPF_FIB_LKUP_RET_NO_NEIGH;
@@ -5156,15 +5289,15 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
	return sk;
}

-/* bpf_sk_lookup performs the core lookup for different types of sockets,
+/* bpf_skc_lookup performs the core lookup for different types of sockets,
 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
 * Returns the socket as an 'unsigned long' to simplify the casting in the
 * callers to satisfy BPF_CALL declarations.
 */
-static unsigned long
-__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
-		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
-		u64 flags)
+static struct sock *
+__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+		 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+		 u64 flags)
{
	struct sock *sk = NULL;
	u8 family = AF_UNSPEC;
@@ -5192,15 +5325,27 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		put_net(net);
	}

+out:
+	return sk;
+}
+
+static struct sock *
+__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+		u64 flags)
+{
+	struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
+					   ifindex, proto, netns_id, flags);
+
	if (sk)
		sk = sk_to_full_sk(sk);
-out:
-	return (unsigned long) sk;
+
+	return sk;
}

-static unsigned long
-bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
-	      u8 proto, u64 netns_id, u64 flags)
+static struct sock *
+bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	       u8 proto, u64 netns_id, u64 flags)
{
	struct net *caller_net;
	int ifindex;
@@ -5213,14 +5358,47 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
		ifindex = 0;
	}

-	return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
-			       proto, netns_id, flags);
+	return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
+				netns_id, flags);
+}
+
+static struct sock *
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
+					 flags);
+
+	if (sk)
+		sk = sk_to_full_sk(sk);
+
+	return sk;
}
+BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
+					     netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
+	.func		= bpf_skc_lookup_tcp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
-	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
+	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
+					    netns_id, flags);
}

static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
@@ -5238,7 +5416,8 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
-	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
+	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
+					    netns_id, flags);
}

static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
@@ -5273,8 +5452,9 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
	struct net *caller_net = dev_net(ctx->rxq->dev);
	int ifindex = ctx->rxq->dev->ifindex;

-	return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
-			       IPPROTO_UDP, netns_id, flags);
+	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
+					      ifindex, IPPROTO_UDP, netns_id,
+					      flags);
}

static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
@@ -5289,14 +5469,38 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
	.arg5_type      = ARG_ANYTHING,
};

+BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	struct net *caller_net = dev_net(ctx->rxq->dev);
+	int ifindex = ctx->rxq->dev->ifindex;
+
+	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
+					       ifindex, IPPROTO_TCP, netns_id,
+					       flags);
+}
+
+static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
+	.func           = bpf_xdp_skc_lookup_tcp,
+	.gpl_only       = false,
+	.pkt_access     = true,
+	.ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type      = ARG_ANYTHING,
+	.arg5_type      = ARG_ANYTHING,
+};
+
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
	struct net *caller_net = dev_net(ctx->rxq->dev);
	int ifindex = ctx->rxq->dev->ifindex;

-	return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
-			       IPPROTO_TCP, netns_id, flags);
+	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
+					      ifindex, IPPROTO_TCP, netns_id,
+					      flags);
}

static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
@@ -5311,11 +5515,31 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
	.arg5_type      = ARG_ANYTHING,
};

+BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
+					       sock_net(ctx->sk), 0,
+					       IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
+	.func		= bpf_sock_addr_skc_lookup_tcp,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCK_COMMON_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
-	return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
-			       IPPROTO_TCP, netns_id, flags);
+	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
+					      sock_net(ctx->sk), 0, IPPROTO_TCP,
+					      netns_id, flags);
}

static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
@@ -5332,8 +5556,9 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
-	return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0,
-			       IPPROTO_UDP, netns_id, flags);
+	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
+					      sock_net(ctx->sk), 0, IPPROTO_UDP,
+					      netns_id, flags);
}

static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
@@ -5461,6 +5686,74 @@ static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};
+
+BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+	   struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+	u32 cookie;
+	int ret;
+
+	if (unlikely(th_len < sizeof(*th)))
+		return -EINVAL;
+
+	/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
+	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+		return -EINVAL;
+
+	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+		return -EINVAL;
+
+	if (!th->ack || th->rst || th->syn)
+		return -ENOENT;
+
+	if (tcp_synq_no_recent_overflow(sk))
+		return -ENOENT;
+
+	cookie = ntohl(th->ack_seq) - 1;
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		if (unlikely(iph_len < sizeof(struct iphdr)))
+			return -EINVAL;
+
+		ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
+		break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+	case AF_INET6:
+		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+
+		ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
+		break;
+#endif /* CONFIG_IPV6 */
+
+	default:
+		return -EPROTONOSUPPORT;
+	}
+
+	if (ret > 0)
+		return 0;
+
+	return -ENOENT;
+#else
+	return -ENOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
+	.func		= bpf_tcp_check_syncookie,
+	.gpl_only	= true,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
#endif /* CONFIG_INET */

bool bpf_helper_changes_pkt_data(void *func)
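A hypothetical XDP program (restricted C) pairing the two new helpers: bpf_skc_lookup_tcp() finds a listener without requiring a full socket, and bpf_tcp_check_syncookie() asks the kernel whether the ACK carries a valid SYN cookie for it. The parser assumes an option-less IPv4 header purely to keep the sketch short; headers and section naming follow libbpf-era conventions.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

SEC("xdp")
int syncookie_check(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = (void *)(eth + 1);	/* assumes no IP options */
	struct tcphdr *th = (void *)(iph + 1);
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;
	int ret;

	if ((void *)(th + 1) > data_end)
		return XDP_PASS;
	if (eth->h_proto != bpf_htons(ETH_P_IP) ||
	    iph->protocol != IPPROTO_TCP)
		return XDP_PASS;

	tuple.ipv4.saddr = iph->saddr;
	tuple.ipv4.daddr = iph->daddr;
	tuple.ipv4.sport = th->source;
	tuple.ipv4.dport = th->dest;

	sk = bpf_skc_lookup_tcp(ctx, &tuple, sizeof(tuple.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return XDP_PASS;

	ret = bpf_tcp_check_syncookie(sk, iph, sizeof(*iph),
				      th, sizeof(*th));
	bpf_sk_release(sk);

	return ret == 0 ? XDP_PASS : XDP_DROP;
}

char _license[] SEC("license") = "GPL";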
@@ -5586,6 +5879,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sock_addr_sk_lookup_udp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
+	case BPF_FUNC_skc_lookup_tcp:
+		return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
	default:
		return bpf_base_func_proto(func_id);
@@ -5609,6 +5904,9 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
	}
}

+const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
+const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
+
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -5617,6 +5915,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_get_local_storage_proto;
	case BPF_FUNC_sk_fullsock:
		return &bpf_sk_fullsock_proto;
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
@@ -5698,6 +6000,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_skb_fib_lookup_proto;
	case BPF_FUNC_sk_fullsock:
		return &bpf_sk_fullsock_proto;
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM
	case BPF_FUNC_skb_get_xfrm_state:
		return &bpf_skb_get_xfrm_state_proto;
@@ -5719,6 +6025,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_tcp_sock_proto;
	case BPF_FUNC_get_listener_sock:
		return &bpf_get_listener_sock_proto;
+	case BPF_FUNC_skc_lookup_tcp:
+		return &bpf_skc_lookup_tcp_proto;
+	case BPF_FUNC_tcp_check_syncookie:
+		return &bpf_tcp_check_syncookie_proto;
+	case BPF_FUNC_skb_ecn_set_ce:
+		return &bpf_skb_ecn_set_ce_proto;
#endif
	default:
		return bpf_base_func_proto(func_id);
@@ -5754,6 +6066,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_xdp_sk_lookup_tcp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
+	case BPF_FUNC_skc_lookup_tcp:
+		return &bpf_xdp_skc_lookup_tcp_proto;
+	case BPF_FUNC_tcp_check_syncookie:
+		return &bpf_tcp_check_syncookie_proto;
#endif
	default:
		return bpf_base_func_proto(func_id);
@@ -5846,6 +6162,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_sk_lookup_udp_proto;
	case BPF_FUNC_sk_release:
		return &bpf_sk_release_proto;
+	case BPF_FUNC_skc_lookup_tcp:
+		return &bpf_skc_lookup_tcp_proto;
#endif
	default:
		return bpf_base_func_proto(func_id);
@@ -5857,7 +6175,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_skb_load_bytes:
-		return &bpf_skb_load_bytes_proto;
+		return &bpf_flow_dissector_load_bytes_proto;
	default:
		return bpf_base_func_proto(func_id);
	}
@@ -5984,9 +6302,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
			return false;
		break;
	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
-		if (size != sizeof(__u64))
-			return false;
-		break;
+		return false;
	case bpf_ctx_range(struct __sk_buff, tstamp):
		if (size != sizeof(__u64))
			return false;
@@ -6021,7 +6337,6 @@ static bool sk_filter_is_valid_access(int off, int size,
	case bpf_ctx_range(struct __sk_buff, data):
	case bpf_ctx_range(struct __sk_buff, data_meta):
	case bpf_ctx_range(struct __sk_buff, data_end):
-	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
	case bpf_ctx_range(struct __sk_buff, tstamp):
	case bpf_ctx_range(struct __sk_buff, wire_len):
@@ -6048,7 +6363,6 @@ static bool cg_skb_is_valid_access(int off, int size,
	switch (off) {
	case bpf_ctx_range(struct __sk_buff, tc_classid):
	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
	case bpf_ctx_range(struct __sk_buff, wire_len):
		return false;
	case bpf_ctx_range(struct __sk_buff, data):
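A hypothetical sketch of the socket-storage helpers wired up above, as a cgroup/skb egress program (restricted C) keeping a per-socket packet counter in a BPF_MAP_TYPE_SK_STORAGE map. The BTF-style map definition matches newer libbpf; contemporary toolchains spelled map definitions differently.

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct pkt_count {
	__u64 packets;
};

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct pkt_count);
} sk_pkt_count SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct pkt_count *cnt;

	if (!sk)
		return 1;		/* allow */
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;

	/* Create the per-socket slot on first use. */
	cnt = bpf_sk_storage_get(&sk_pkt_count, sk, 0,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (cnt)
		__sync_fetch_and_add(&cnt->packets, 1);

	return 1;
}

char _license[] SEC("license") = "GPL";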
@@ -6094,7 +6408,6 @@ static bool lwt_is_valid_access(int off, int size,
	case bpf_ctx_range(struct __sk_buff, tc_classid):
	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
	case bpf_ctx_range(struct __sk_buff, tstamp):
	case bpf_ctx_range(struct __sk_buff, wire_len):
		return false;
@@ -6337,7 +6650,6 @@ static bool tc_cls_act_is_valid_access(int off, int size,
	case bpf_ctx_range(struct __sk_buff, data_end):
		info->reg_type = PTR_TO_PACKET_END;
		break;
-	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
		return false;
	}
@@ -6539,7 +6851,6 @@ static bool sk_skb_is_valid_access(int off, int size,
	switch (off) {
	case bpf_ctx_range(struct __sk_buff, tc_classid):
	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
	case bpf_ctx_range(struct __sk_buff, tstamp):
	case bpf_ctx_range(struct __sk_buff, wire_len):
		return false;
@@ -6613,34 +6924,65 @@ static bool flow_dissector_is_valid_access(int off, int size,
					   const struct bpf_prog *prog,
					   struct bpf_insn_access_aux *info)
{
-	if (type == BPF_WRITE) {
-		switch (off) {
-		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
-			break;
-		default:
-			return false;
-		}
-	}
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct __sk_buff))
+		return false;
+
+	if (type == BPF_WRITE)
+		return false;

	switch (off) {
	case bpf_ctx_range(struct __sk_buff, data):
+		if (size != size_default)
+			return false;
		info->reg_type = PTR_TO_PACKET;
-		break;
+		return true;
	case bpf_ctx_range(struct __sk_buff, data_end):
+		if (size != size_default)
+			return false;
		info->reg_type = PTR_TO_PACKET_END;
-		break;
+		return true;
	case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+		if (size != sizeof(__u64))
+			return false;
		info->reg_type = PTR_TO_FLOW_KEYS;
-		break;
-	case bpf_ctx_range(struct __sk_buff, tc_classid):
-	case bpf_ctx_range(struct __sk_buff, data_meta):
-	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
-	case bpf_ctx_range(struct __sk_buff, tstamp):
-	case bpf_ctx_range(struct __sk_buff, wire_len):
+		return true;
+	default:
		return false;
	}
+}

-	return bpf_skb_is_valid_access(off, size, type, prog, info);
+static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
+					     const struct bpf_insn *si,
+					     struct bpf_insn *insn_buf,
+					     struct bpf_prog *prog,
+					     u32 *target_size)
+
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct __sk_buff, data):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_flow_dissector, data));
+		break;
+
+	case offsetof(struct __sk_buff, data_end):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_flow_dissector, data_end));
+		break;
+
+	case offsetof(struct __sk_buff, flow_keys):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_flow_dissector, flow_keys));
+		break;
+	}
+
+	return insn - insn_buf;
}

static u32 bpf_convert_ctx_access(enum bpf_access_type type,
@@ -6947,15 +7289,6 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
				      skc_num, 2, target_size));
		break;

-	case offsetof(struct __sk_buff, flow_keys):
-		off  = si->off;
-		off -= offsetof(struct __sk_buff, flow_keys);
-		off += offsetof(struct sk_buff, cb);
-		off += offsetof(struct qdisc_skb_cb, flow_keys);
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
-				      si->src_reg, off);
-		break;
-
	case offsetof(struct __sk_buff, tstamp):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8);
@@ -7960,7 +8293,7 @@ const struct bpf_prog_ops sk_msg_prog_ops = {
const struct bpf_verifier_ops flow_dissector_verifier_ops = {
	.get_func_proto		= flow_dissector_func_proto,
	.is_valid_access	= flow_dissector_is_valid_access,
-	.convert_ctx_access	= bpf_convert_ctx_access,
+	.convert_ctx_access	= flow_dissector_convert_ctx_access,
};

const struct bpf_prog_ops flow_dissector_prog_ops = {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index bb1a54747d64..548f39dde307 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -65,6 +65,45 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);

+int skb_flow_dissector_prog_query(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	u32 prog_id, prog_cnt = 0, flags = 0;
+	struct bpf_prog *attached;
+	struct net *net;
+
+	if (attr->query.query_flags)
+		return -EINVAL;
+
+	net = get_net_ns_by_fd(attr->query.target_fd);
+	if (IS_ERR(net))
+		return PTR_ERR(net);
+
+	rcu_read_lock();
+	attached = rcu_dereference(net->flow_dissector_prog);
+	if (attached) {
+		prog_cnt = 1;
+		prog_id = attached->aux->id;
+	}
+	rcu_read_unlock();
+
+	put_net(net);
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+		return -EFAULT;
+	if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+		return -EFAULT;
+
+	if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+		return 0;
+
+	if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
+		return -EFAULT;
+
+	return 0;
+}
+
int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
				       struct bpf_prog *prog)
{
@@ -683,48 +722,32 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
	}
}

-bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
-			    const struct sk_buff *skb,
-			    struct flow_dissector *flow_dissector,
-			    struct bpf_flow_keys *flow_keys)
+bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+		      __be16 proto, int nhoff, int hlen)
{
-	struct bpf_skb_data_end cb_saved;
-	struct bpf_skb_data_end *cb;
+	struct bpf_flow_keys *flow_keys = ctx->flow_keys;
	u32 result;

-	/* Note that even though the const qualifier is discarded
-	 * throughout the execution of the BPF program, all changes(the
-	 * control block) are reverted after the BPF program returns.
-	 * Therefore, __skb_flow_dissect does not alter the skb.
-	 */
-
-	cb = (struct bpf_skb_data_end *)skb->cb;
-
-	/* Save Control Block */
-	memcpy(&cb_saved, cb, sizeof(cb_saved));
-	memset(cb, 0, sizeof(*cb));
-
	/* Pass parameters to the BPF program */
	memset(flow_keys, 0, sizeof(*flow_keys));
-	cb->qdisc_cb.flow_keys = flow_keys;
-	flow_keys->nhoff = skb_network_offset(skb);
+	flow_keys->n_proto = proto;
+	flow_keys->nhoff = nhoff;
	flow_keys->thoff = flow_keys->nhoff;

-	bpf_compute_data_pointers((struct sk_buff *)skb);
-	result = BPF_PROG_RUN(prog, skb);
+	preempt_disable();
+	result = BPF_PROG_RUN(prog, ctx);
+	preempt_enable();

-	/* Restore state */
-	memcpy(cb, &cb_saved, sizeof(cb_saved));
-
-	flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len);
+	flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
	flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
-				   flow_keys->nhoff, skb->len);
+				   flow_keys->nhoff, hlen);

	return result == BPF_OK;
}

/**
 * __skb_flow_dissect - extract the flow_keys struct and return it
+ * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
 * @flow_dissector: list of keys to dissect
 * @target_container: target structure to put dissected values into
@@ -732,6 +755,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ * @flags: flags that control the dissection process, e.g.
+ *         FLOW_DISSECTOR_F_STOP_AT_L3.
 *
 * The function will try to retrieve individual keys into target specified
 * by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -739,7 +764,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
 *
 * Caller must take care of zeroing target container memory.
 */
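A hypothetical flow-dissector program (restricted C) written against the skb-less context introduced above: it only sees data, data_end and flow_keys, handles plain IPv4 and hands everything else back with BPF_DROP. A real dissector must also deal with VLANs, IPv6, fragments and encapsulation.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

SEC("flow_dissector")
int dissect(struct __sk_buff *skb)
{
	struct bpf_flow_keys *keys = skb->flow_keys;
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct iphdr *iph;

	if (keys->n_proto != bpf_htons(ETH_P_IP))
		return BPF_DROP;

	iph = data + keys->nhoff;
	if ((void *)(iph + 1) > data_end)
		return BPF_DROP;

	keys->addr_proto = ETH_P_IP;
	keys->ipv4_src = iph->saddr;
	keys->ipv4_dst = iph->daddr;
	keys->ip_proto = iph->protocol;
	keys->nhoff += iph->ihl * 4;	/* advance past the IP header */
	keys->thoff = keys->nhoff;	/* transport header starts here */

	return BPF_OK;
}

char _license[] SEC("license") = "GPL";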
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(const struct net *net,
+			const struct sk_buff *skb,
			struct flow_dissector *flow_dissector,
			void *target_container,
			void *data, __be16 proto, int nhoff, int hlen,
@@ -752,6 +778,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
	struct flow_dissector_key_icmp *key_icmp;
	struct flow_dissector_key_tags *key_tags;
	struct flow_dissector_key_vlan *key_vlan;
+	struct bpf_prog *attached = NULL;
	enum flow_dissect_ret fdret;
	enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
	int num_hdrs = 0;
@@ -794,22 +821,39 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
					      target_container);

	if (skb) {
-		struct bpf_flow_keys flow_keys;
-		struct bpf_prog *attached = NULL;
+		if (!net) {
+			if (skb->dev)
+				net = dev_net(skb->dev);
+			else if (skb->sk)
+				net = sock_net(skb->sk);
+		}
+	}

+	WARN_ON_ONCE(!net);
+	if (net) {
		rcu_read_lock();
-
-		if (skb->dev)
-			attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);
-		else if (skb->sk)
-			attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog);
-		else
-			WARN_ON_ONCE(1);
+		attached = rcu_dereference(net->flow_dissector_prog);

		if (attached) {
-			ret = __skb_flow_bpf_dissect(attached, skb,
-						     flow_dissector,
-						     &flow_keys);
+			struct bpf_flow_keys flow_keys;
+			struct bpf_flow_dissector ctx = {
+				.flow_keys = &flow_keys,
+				.data = data,
+				.data_end = data + hlen,
+			};
+			__be16 n_proto = proto;
+
+			if (skb) {
+				ctx.skb = skb;
+				/* we can't use 'proto' in the skb case
+				 * because it might be set to skb->vlan_proto
+				 * which has been pulled from the data
+				 */
+				n_proto = skb->protocol;
+			}
+
+			ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
+					       hlen);
			__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
						 target_container);
			rcu_read_unlock();
@@ -1406,8 +1450,8 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
	__flow_hash_secret_init();

	memset(&keys, 0, sizeof(keys));
-	__skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys,
-			   NULL, 0, 0, 0,
+	__skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric,
+			   &keys, NULL, 0, 0, 0,
			   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	return __flow_hash_from_keys(&keys, hashrnd);
@@ -1508,7 +1552,8 @@ u32 skb_get_poff(const struct sk_buff *skb)
{
	struct flow_keys_basic keys;

-	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
+	if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
+					      NULL, 0, 0, 0, 0))
		return 0;

	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index ac679f74ba47..9bf1b9ad1780 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -291,6 +291,7 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
	for_each_possible_cpu(i) {
		const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);

+		qstats->qlen = 0;
		qstats->backlog += qcpu->backlog;
		qstats->drops += qcpu->drops;
		qstats->requeues += qcpu->requeues;
@@ -306,6 +307,7 @@ void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
	if (cpu) {
		__gnet_stats_copy_queue_cpu(qstats, cpu);
	} else {
+		qstats->qlen = q->qlen;
		qstats->backlog = q->backlog;
		qstats->drops = q->drops;
		qstats->requeues = q->requeues;
bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, int ret; u32 fd; - ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, - NULL); + ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr, + bpf_prog_policy, NULL); if (ret < 0) return ret; @@ -384,7 +385,8 @@ static int bpf_build_state(struct nlattr *nla, if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; - ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); + ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, + extack); if (ret < 0) return ret; @@ -452,7 +454,7 @@ static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, if (!prog->prog) return 0; - nest = nla_nest_start(skb, attr); + nest = nla_nest_start_noflag(skb, attr); if (!nest) return -EMSGSIZE; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 19b557bd294b..69e249fbc02f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -26,7 +26,7 @@ #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #ifdef CONFIG_MODULES @@ -223,7 +223,8 @@ void lwtstate_free(struct lwtunnel_state *lws) } EXPORT_SYMBOL_GPL(lwtstate_free); -int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; @@ -236,7 +237,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; - nest = nla_nest_start(skb, RTA_ENCAP); + nest = nla_nest_start_noflag(skb, encap_attr); if (!nest) return -EMSGSIZE; @@ -250,7 +251,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) if (ret) goto nla_put_failure; nla_nest_end(skb, nest); - ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 30f6fd8f68e0..dfa871061f14 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -31,6 +31,7 @@ #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> +#include <net/arp.h> #include <net/dst.h> #include <net/sock.h> #include <net/netevent.h> @@ -663,6 +664,8 @@ out: out_tbl_unlock: write_unlock_bh(&tbl->lock); out_neigh_release: + if (!exempt_from_gc) + atomic_dec(&tbl->gc_entries); neigh_release(n); goto out; } @@ -1862,7 +1865,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nda_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, + nda_policy, extack); if (err < 0) goto out; @@ -1920,6 +1924,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + if (tbl->allow_add && !tbl->allow_add(dev, extack)) { + err = -EINVAL; + goto out; + } + neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { bool exempt_from_gc; @@ -1974,7 +1983,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; - nest = nla_nest_start(skb, NDTA_PARMS); + nest = nla_nest_start_noflag(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; @@ -2176,8 +2185,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, bool found = false; int err, tidx; - err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, - 
nl_neightbl_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, + nl_neightbl_policy, extack); if (err < 0) goto errout; @@ -2214,8 +2223,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct neigh_parms *p; int i, ifindex = 0; - err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], - nl_ntbl_parm_policy, extack); + err = nla_parse_nested_deprecated(tbp, NDTPA_MAX, + tb[NDTA_PARMS], + nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; @@ -2655,11 +2665,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - nda_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), + tb, NDA_MAX, nda_policy, + extack); } else { - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - nda_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); } if (err < 0) return err; @@ -2759,8 +2770,8 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - nda_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); if (err < 0) return err; @@ -2982,7 +2993,13 @@ int neigh_xmit(int index, struct net_device *dev, if (!tbl) goto out; rcu_read_lock_bh(); - neigh = __neigh_lookup_noref(tbl, addr, dev); + if (index == NEIGH_ARP_TABLE) { + u32 key = *((u32 *)addr); + + neigh = __ipv4_neigh_lookup_noref(dev, key); + } else { + neigh = __neigh_lookup_noref(tbl, addr, dev); + } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 63881f72ef71..36347933ec3a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -258,7 +258,7 @@ static int ptype_seq_show(struct seq_file *seq, void *v) else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s %pf\n", + seq_printf(seq, " %-8s %ps\n", pt->dev ? 
pt->dev->name : "", pt->func); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f8f94303a1f5..d9c4360257ce 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -754,9 +754,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, rcu_assign_pointer(queue->rps_map, map); if (map) - static_key_slow_inc(&rps_needed); + static_branch_inc(&rps_needed); if (old_map) - static_key_slow_dec(&rps_needed); + static_branch_dec(&rps_needed); mutex_unlock(&rps_map_mutex); @@ -863,6 +863,7 @@ static struct attribute *rx_queue_default_attrs[] __ro_after_init = { #endif NULL }; +ATTRIBUTE_GROUPS(rx_queue_default); static void rx_queue_release(struct kobject *kobj) { @@ -911,7 +912,7 @@ static void rx_queue_get_ownership(struct kobject *kobj, static struct kobj_type rx_queue_ktype __ro_after_init = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_attrs = rx_queue_default_attrs, + .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1416,6 +1417,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { #endif NULL }; +ATTRIBUTE_GROUPS(netdev_queue_default); static void netdev_queue_release(struct kobject *kobj) { @@ -1448,7 +1450,7 @@ static void netdev_queue_get_ownership(struct kobject *kobj, static struct kobj_type netdev_queue_ktype __ro_after_init = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_attrs = netdev_queue_default_attrs, + .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; @@ -1747,20 +1749,16 @@ int netdev_register_kobject(struct net_device *ndev) error = device_add(dev); if (error) - goto error_put_device; + return error; error = register_queue_kobjects(ndev); - if (error) - goto error_device_del; + if (error) { + device_del(dev); + return error; + } pm_runtime_set_memalloc_noio(dev, true); - return 0; - -error_device_del: - device_del(dev); -error_put_device: - put_device(dev); return error; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 17f36317363d..711b161505ac 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -304,6 +304,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->count, 1); refcount_set(&net->passive, 1); + get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); @@ -681,8 +682,8 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *peer; int nsid, err; - err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; if (!tb[NETNSA_NSID]) { @@ -786,11 +787,13 @@ static int rtnl_net_valid_getid_req(struct sk_buff *skb, int i, err; if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), + tb, NETNSA_MAX, rtnl_net_policy, + extack); - err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, + extack); if (err) return err; @@ -838,7 +841,7 @@ static int rtnl_net_getid(struct sk_buff *skb, 
struct nlmsghdr *nlh, peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { - peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; @@ -928,8 +931,9 @@ static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, struct nlattr *tb[NETNSA_MAX + 1]; int err, i; - err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy, + extack); if (err < 0) return err; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 361aabffb8c0..a0f05416657b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -149,7 +149,7 @@ static void poll_one_napi(struct napi_struct *napi) * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); - WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll); + WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); @@ -323,7 +323,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_pick_tx(dev, skb, NULL); + txq = netdev_core_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; @@ -346,7 +346,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b9057478d69c..7e3d0d99dfae 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -301,6 +301,4 @@ static int __init init_cgroup_netprio(void) register_netdevice_notifier(&netprio_device_notifier); return 0; } - subsys_initcall(init_cgroup_netprio); -MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f3f5a78cd062..319ad5490fb3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2521,7 +2521,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); - err = x->outer_mode->output(x, skb); + err = pktgen_xfrm_outer_mode_output(x, skb); rcu_read_unlock_bh(); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index 703cf76aa7c2..7109c168b5e0 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -185,9 +185,10 @@ void __init ptp_classifier_init(void) { 0x16, 0, 0, 0x00000000 }, { 0x06, 0, 0, 0x00000000 }, }; - struct sock_fprog_kern ptp_prog = { - .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter, - }; + struct sock_fprog_kern ptp_prog; + + ptp_prog.len = ARRAY_SIZE(ptp_filter); + ptp_prog.filter = ptp_filter; BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog)); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a51cab95ba64..2bd12afb9297 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -634,7 +634,7 @@ static int rtnl_link_slave_info_fill(struct sk_buff *skb, if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, 
ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { - slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA); + slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); @@ -666,7 +666,7 @@ static int rtnl_link_info_fill(struct sk_buff *skb, return err; } if (ops->fill_info) { - data = nla_nest_start(skb, IFLA_INFO_DATA); + data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); @@ -686,7 +686,7 @@ static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) struct nlattr *linkinfo; int err = -EMSGSIZE; - linkinfo = nla_nest_start(skb, IFLA_LINKINFO); + linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; @@ -755,7 +755,7 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) struct nlattr *mx; int i, valid = 0; - mx = nla_nest_start(skb, RTA_METRICS); + mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; @@ -1036,12 +1036,12 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) int vf; int err; - vf_ports = nla_nest_start(skb, IFLA_VF_PORTS); + vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { - vf_port = nla_nest_start(skb, IFLA_VF_PORT); + vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) @@ -1070,7 +1070,7 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) struct nlattr *port_self; int err; - port_self = nla_nest_start(skb, IFLA_PORT_SELF); + port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; @@ -1247,7 +1247,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; - vf = nla_nest_start(skb, IFLA_VF_INFO); + vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || @@ -1266,7 +1266,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; - vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), @@ -1279,7 +1279,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); - vfstats = nla_nest_start(skb, IFLA_VF_STATS); + vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, @@ -1329,7 +1329,7 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, if (!dev->netdev_ops->ndo_get_vf_config) return 0; - vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; @@ -1414,7 +1414,7 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) int err; u8 mode; - xdp = nla_nest_start(skb, IFLA_XDP); + xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; @@ -1541,7 +1541,7 @@ static int rtnl_fill_link_af(struct 
sk_buff *skb, const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; - af_spec = nla_nest_start(skb, IFLA_AF_SPEC); + af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; @@ -1552,7 +1552,7 @@ static int rtnl_fill_link_af(struct sk_buff *skb, if (!af_ops->fill_link_af) continue; - af = nla_nest_start(skb, af_ops->family); + af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; @@ -1797,8 +1797,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; - if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, - ifla_info_policy, NULL) < 0) + if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { @@ -1897,8 +1896,9 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, return -EINVAL; } - return nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, - ifla_policy, extack); + return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, + IFLA_MAX, ifla_policy, + extack); } /* A hack to preserve kernel<->userspace interface. @@ -1911,7 +1911,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - return nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); + return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, + extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) @@ -2019,7 +2020,8 @@ out_err: int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, struct netlink_ext_ack *exterr) { - return nla_parse(tb, IFLA_MAX, head, len, ifla_policy, exterr); + return nla_parse_deprecated(tb, IFLA_MAX, head, len, ifla_policy, + exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifla); @@ -2564,8 +2566,10 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } - err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr, - ifla_vf_policy, NULL); + err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, + attr, + ifla_vf_policy, + NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); @@ -2592,8 +2596,10 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } - err = nla_parse_nested(port, IFLA_PORT_MAX, attr, - ifla_port_policy, NULL); + err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, + attr, + ifla_port_policy, + NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { @@ -2612,9 +2618,9 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; - err = nla_parse_nested(port, IFLA_PORT_MAX, - tb[IFLA_PORT_SELF], ifla_port_policy, - NULL); + err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], + ifla_port_policy, NULL); if (err < 0) goto errout; @@ -2661,8 +2667,9 @@ static int do_setlink(const struct sk_buff *skb, struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; - err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], - ifla_xdp_policy, NULL); + err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, + tb[IFLA_XDP], + ifla_xdp_policy, NULL); if (err < 0) goto errout; @@ -2716,8 +2723,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *tb[IFLA_MAX+1]; char ifname[IFNAMSIZ]; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), 
tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) goto errout; @@ -2813,7 +2820,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, int err; int netnsid = -1; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) return err; @@ -2990,7 +2998,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, #ifdef CONFIG_MODULES replay: #endif - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) return err; @@ -3024,9 +3033,9 @@ replay: return err; if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, - tb[IFLA_LINKINFO], ifla_info_policy, - NULL); + err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], + ifla_info_policy, NULL); if (err < 0) return err; } else @@ -3046,9 +3055,9 @@ replay: return -EINVAL; if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, extack); + err = nla_parse_nested_deprecated(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); if (err < 0) return err; data = attr; @@ -3067,9 +3076,11 @@ replay: if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, extack); + err = nla_parse_nested_deprecated(slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, + extack); if (err < 0) return err; slave_data = slave_attr; @@ -3250,8 +3261,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, - extack); + return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || @@ -3260,8 +3271,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, - extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); if (err) return err; @@ -3366,7 +3377,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { + if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } @@ -3569,7 +3580,7 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -/** +/* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, @@ -3639,7 +3650,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, u16 vid; int err; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, + extack); if (err < 0) return err; @@ -3708,7 +3720,7 @@ out: return err; } -/** +/* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, @@ -3749,7 +3761,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, + extack); if (err < 0) return err; @@ -3847,8 +3860,11 @@ skip: /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. - * @nlh: netlink message header + * @skb: socket buffer to store message in + * @cb: netlink callback * @dev: netdevice + * @filter_dev: ignored + * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. @@ -3895,8 +3911,8 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - NULL, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, NULL, extack); if (err < 0) return err; @@ -3948,8 +3964,9 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; - err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); if (err < 0) { return -EINVAL; } else if (err == 0) { @@ -4088,8 +4105,8 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, - nda_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, + NDA_MAX, nda_policy, extack); if (err < 0) return err; @@ -4270,7 +4287,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; - br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); + br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; @@ -4294,7 +4311,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, } nla_nest_end(skb, br_afspec); - protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); + protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; @@ -4351,11 +4368,14 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, + sizeof(struct ifinfomsg), + tb, IFLA_MAX, 
ifla_policy, + extack); } else { - err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, extack); + err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), + tb, IFLA_MAX, ifla_policy, + extack); } if (err < 0) return err; @@ -4773,8 +4793,8 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; - attr = nla_nest_start(skb, - IFLA_STATS_LINK_XSTATS); + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_XSTATS); if (!attr) goto nla_put_failure; @@ -4796,8 +4816,8 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; - attr = nla_nest_start(skb, - IFLA_STATS_LINK_XSTATS_SLAVE); + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) goto nla_put_failure; @@ -4812,7 +4832,8 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; - attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + attr = nla_nest_start_noflag(skb, + IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) goto nla_put_failure; @@ -4831,7 +4852,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; - attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC); + attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) goto nla_put_failure; @@ -4841,7 +4862,8 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlattr *af; int err; - af = nla_nest_start(skb, af_ops->family); + af = nla_nest_start_noflag(skb, + af_ops->family); if (!af) { rcu_read_unlock(); goto nla_put_failure; @@ -4948,7 +4970,7 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < sizeof(*ifsm)) { + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2415d9cb9b89..e89be6282693 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -77,6 +77,8 @@ #include <linux/capability.h> #include <linux/user_namespace.h> +#include "datagram.h" + struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; #ifdef CONFIG_SKB_EXTENSIONS @@ -256,6 +258,33 @@ nodata: } EXPORT_SYMBOL(__alloc_skb); +/* Caller must provide SKB that is memset cleared */ +static struct sk_buff *__build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + struct skb_shared_info *shinfo; + unsigned int size = frag_size ? 
: ksize(data); + + size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + /* Assumes caller memset cleared SKB */ + skb->truesize = SKB_TRUESIZE(size); + refcount_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + return skb; +} + /** * __build_skb - build a network buffer * @data: data buffer provided by caller @@ -277,32 +306,15 @@ EXPORT_SYMBOL(__alloc_skb); */ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { - struct skb_shared_info *shinfo; struct sk_buff *skb; - unsigned int size = frag_size ? : ksize(data); skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); - if (!skb) + if (unlikely(!skb)) return NULL; - size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->truesize = SKB_TRUESIZE(size); - refcount_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; - skb->mac_header = (typeof(skb->mac_header))~0U; - skb->transport_header = (typeof(skb->transport_header))~0U; - - /* make sure we initialize shinfo sequentially */ - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - return skb; + return __build_skb_around(skb, data, frag_size); } /* build_skb() is wrapper over __build_skb(), that specifically @@ -323,6 +335,29 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); +/** + * build_skb_around - build a network buffer around provided skb + * @skb: sk_buff provide by caller, must be memset cleared + * @data: data buffer provided by caller + * @frag_size: size of data, or 0 if head was kmalloced + */ +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size) +{ + if (unlikely(!skb)) + return NULL; + + skb = __build_skb_around(skb, data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (page_is_pfmemalloc(virt_to_head_page(data))) + skb->pfmemalloc = 1; + } + return skb; +} +EXPORT_SYMBOL(build_skb_around); + #define NAPI_SKB_CACHE_SIZE 64 struct napi_alloc_cache { @@ -1105,9 +1140,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); -extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length); - int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); @@ -3801,7 +3833,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int delta_truesize; struct sk_buff *lp; - if (unlikely(p->len + len >= 65536)) + if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) return -E2BIG; lp = NAPI_GRO_CB(p)->last; @@ -5083,7 +5115,8 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { - int mac_len; + int mac_len, meta_len; + void *meta; if (skb_cow(skb, skb_headroom(skb)) < 0) { kfree_skb(skb); @@ -5095,6 +5128,13 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) memmove(skb_mac_header(skb) + VLAN_HLEN, 
skb_mac_header(skb), mac_len - VLAN_HLEN - ETH_TLEN); } + + meta_len = skb_metadata_len(skb); + if (meta_len) { + meta = skb_metadata_end(skb) - meta_len; + memmove(meta + VLAN_HLEN, meta, meta_len); + } + skb->mac_header += VLAN_HLEN; return skb; } diff --git a/net/core/sock.c b/net/core/sock.c index 782343bb925b..75b1c950b49f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -137,6 +137,7 @@ #include <linux/filter.h> #include <net/sock_reuseport.h> +#include <net/bpf_sk_storage.h> #include <trace/events/sock.h> @@ -348,7 +349,7 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; } - if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; *(struct old_timeval32 *)optval = tv32; return sizeof(tv32); @@ -372,7 +373,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool { struct __kernel_sock_timeval tv; - if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { + if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; if (optlen < sizeof(tv32)) @@ -1709,6 +1710,10 @@ static void __sk_destruct(struct rcu_head *head) sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); +#ifdef CONFIG_BPF_SYSCALL + bpf_sk_storage_free(sk); +#endif + if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); @@ -2977,39 +2982,44 @@ bool lock_sock_fast(struct sock *sk) } EXPORT_SYMBOL(lock_sock_fast); -int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32) { - struct timeval tv; + struct sock *sk = sock->sk; + struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sock_read_timestamp(sk)); - if (tv.tv_sec == -1) + ts = ktime_to_timespec64(sock_read_timestamp(sk)); + if (ts.tv_sec == -1) return -ENOENT; - if (tv.tv_sec == 0) { + if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - tv = ktime_to_timeval(kt); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec64(kt); } - return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; -} -EXPORT_SYMBOL(sock_get_timestamp); -int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) -{ - struct timespec ts; + if (timeval) + ts.tv_nsec /= 1000; - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sock_read_timestamp(sk)); - if (ts.tv_sec == -1) - return -ENOENT; - if (ts.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - ts = ktime_to_timespec(sk->sk_stamp); +#ifdef CONFIG_COMPAT_32BIT_TIME + if (time32) + return put_old_timespec32(&ts, userstamp); +#endif +#ifdef CONFIG_SPARC64 + /* beware of padding in sparc64 timeval */ + if (timeval && !in_compat_syscall()) { + struct __kernel_old_timeval __user tv = { + .tv_sec = ts.tv_sec, + .tv_usec = ts.tv_nsec, + }; + if (copy_to_user(userstamp, &tv, sizeof(tv))) + return -EFAULT; + return 0; } - return copy_to_user(userstamp, &ts, sizeof(ts)) ?
-EFAULT : 0; +#endif + return put_timespec64(&ts, userstamp); } -EXPORT_SYMBOL(sock_get_timestampns); +EXPORT_SYMBOL(sock_gettstamp); void sock_enable_timestamp(struct sock *sk, int flag) { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index d8fe3e549373..dc4aefdf2a08 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -144,6 +144,8 @@ static void reuseport_free_rcu(struct rcu_head *head) * reuseport_add_sock - Add a socket to the reuseport group of another. * @sk: New socket to add to the group. * @sk2: Socket belonging to the existing reuseport group. + * @bind_inany: Whether or not the group is bound to a local INANY address. + * * May return ENOMEM and not add socket to group under memory pressure. */ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 84bf2861f45f..1a2685694abd 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -95,12 +95,12 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) { - static_key_slow_inc(&rps_needed); - static_key_slow_inc(&rfs_needed); + static_branch_inc(&rps_needed); + static_branch_inc(&rfs_needed); } if (orig_sock_table) { - static_key_slow_dec(&rps_needed); - static_key_slow_dec(&rfs_needed); + static_branch_dec(&rps_needed); + static_branch_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index a556cd708885..ceff9d22deea 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -241,12 +241,13 @@ static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX, - tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, + tb[DCB_ATTR_PFC_CFG], + dcbnl_pfc_up_nest, NULL); if (ret) return ret; - nest = nla_nest_start(skb, DCB_ATTR_PFC_CFG); + nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; @@ -299,12 +300,13 @@ static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], - dcbnl_cap_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, + tb[DCB_ATTR_CAP], dcbnl_cap_nest, + NULL); if (ret) return ret; - nest = nla_nest_start(skb, DCB_ATTR_CAP); + nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return -EMSGSIZE; @@ -343,12 +345,13 @@ static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], - dcbnl_numtcs_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, + tb[DCB_ATTR_NUMTCS], + dcbnl_numtcs_nest, NULL); if (ret) return ret; - nest = nla_nest_start(skb, DCB_ATTR_NUMTCS); + nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; @@ -388,8 +391,9 @@ static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], - dcbnl_numtcs_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, + tb[DCB_ATTR_NUMTCS], 
+ dcbnl_numtcs_nest, NULL); if (ret) return ret; @@ -447,8 +451,9 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_APP]) return -EINVAL; - ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], - dcbnl_app_nest, NULL); + ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, + tb[DCB_ATTR_APP], dcbnl_app_nest, + NULL); if (ret) return ret; @@ -479,7 +484,7 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, up = dcb_getapp(netdev, &app); } - app_nest = nla_nest_start(skb, DCB_ATTR_APP); + app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) return -EMSGSIZE; @@ -515,8 +520,9 @@ static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_APP]) return -EINVAL; - ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], - dcbnl_app_nest, NULL); + ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, + tb[DCB_ATTR_APP], dcbnl_app_nest, + NULL); if (ret) return ret; @@ -573,12 +579,13 @@ static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; - ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], - dcbnl_pg_nest, NULL); + ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, + tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, + NULL); if (ret) return ret; - pg_nest = nla_nest_start(skb, DCB_ATTR_PG_CFG); + pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; @@ -593,12 +600,13 @@ static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; - ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, data, - dcbnl_tc_param_nest, NULL); + ret = nla_parse_nested_deprecated(param_tb, + DCB_TC_ATTR_PARAM_MAX, data, + dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; - param_nest = nla_nest_start(skb, i); + param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; @@ -730,8 +738,9 @@ static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX, - tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, + tb[DCB_ATTR_PFC_CFG], + dcbnl_pfc_up_nest, NULL); if (ret) return ret; @@ -786,8 +795,9 @@ static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; - ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], - dcbnl_pg_nest, NULL); + ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, + tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, + NULL); if (ret) return ret; @@ -795,8 +805,10 @@ static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, if (!pg_tb[i]) continue; - ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, - pg_tb[i], dcbnl_tc_param_nest, NULL); + ret = nla_parse_nested_deprecated(param_tb, + DCB_TC_ATTR_PARAM_MAX, + pg_tb[i], + dcbnl_tc_param_nest, NULL); if (ret) return ret; @@ -884,12 +896,13 @@ static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; - ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], - dcbnl_bcn_nest, NULL); + ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, + tb[DCB_ATTR_BCN], dcbnl_bcn_nest, + NULL); if (ret) return ret; - bcn_nest = nla_nest_start(skb, 
DCB_ATTR_BCN); + bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; @@ -943,8 +956,9 @@ static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], - dcbnl_pfc_up_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, + tb[DCB_ATTR_BCN], dcbnl_pfc_up_nest, + NULL); if (ret) return ret; @@ -1002,7 +1016,7 @@ static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, */ err = -EMSGSIZE; - app = nla_nest_start(skb, app_nested_type); + app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; @@ -1036,7 +1050,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; - ieee = nla_nest_start(skb, DCB_ATTR_IEEE); + ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; @@ -1106,7 +1120,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) return -EMSGSIZE; } - app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE); + app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; @@ -1174,13 +1188,13 @@ static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; - struct nlattr *pg = nla_nest_start(skb, i); + struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { - struct nlattr *tc_nest = nla_nest_start(skb, i); + struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; @@ -1231,7 +1245,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; - cee = nla_nest_start(skb, DCB_ATTR_CEE); + cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; @@ -1250,7 +1264,8 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) /* local pfc */ if (ops->getpfccfg) { - struct nlattr *pfc_nest = nla_nest_start(skb, DCB_ATTR_CEE_PFC); + struct nlattr *pfc_nest = nla_nest_start_noflag(skb, + DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; @@ -1265,14 +1280,14 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) /* local app */ spin_lock_bh(&dcb_lock); - app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE); + app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { - struct nlattr *app_nest = nla_nest_start(skb, - DCB_ATTR_APP); + struct nlattr *app_nest = nla_nest_start_noflag(skb, + DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; @@ -1305,7 +1320,8 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) /* features flags */ if (ops->getfeatcfg) { - struct nlattr *feat = nla_nest_start(skb, DCB_ATTR_CEE_FEAT); + struct nlattr *feat = nla_nest_start_noflag(skb, + DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; @@ -1429,8 +1445,9 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_IEEE]) return -EINVAL; - err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], - dcbnl_ieee_policy, NULL); + err = 
nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, + tb[DCB_ATTR_IEEE], + dcbnl_ieee_policy, NULL); if (err) return err; @@ -1529,8 +1546,9 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_IEEE]) return -EINVAL; - err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], - dcbnl_ieee_policy, NULL); + err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, + tb[DCB_ATTR_IEEE], + dcbnl_ieee_policy, NULL); if (err) return err; @@ -1602,12 +1620,13 @@ static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; - ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, - tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, + tb[DCB_ATTR_FEATCFG], + dcbnl_featcfg_nest, NULL); if (ret) return ret; - nest = nla_nest_start(skb, DCB_ATTR_FEATCFG); + nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; @@ -1646,8 +1665,9 @@ static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; - ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, - tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); + ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, + tb[DCB_ATTR_FEATCFG], + dcbnl_featcfg_nest, NULL); if (ret) goto err; @@ -1736,8 +1756,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, - dcbnl_rtnl_policy, extack); + ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, + dcbnl_rtnl_policy, extack); if (ret < 0) return ret; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f227f002c73d..db87d9f58019 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -738,7 +738,12 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) return -ENOMEM; - return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); + if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { + kfree(fval.sp.vec); + return -ENOMEM; + } + + return 0; } /** diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 26a21d97b6b0..004535e4c070 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -991,6 +991,7 @@ static const struct proto_ops inet_dccp_ops = { /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ .poll = dccp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, /* FIXME: work on inet_listen to rename it to sock_common_listen */ .listen = inet_dccp_listen, .shutdown = inet_shutdown, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 57d84e9b7b6f..c4e4d1301062 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1075,6 +1075,7 @@ static const struct proto_ops inet6_dccp_ops = { .getname = inet6_getname, .poll = dccp_poll, .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, .listen = inet_dccp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0e2f71ab8367..5dd85ec51bfe 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -263,7 +263,6 @@ int dccp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - int err = 0; const int old_state = sk->sk_state; if (old_state != DCCP_CLOSED) @@ -307,7 +306,7 @@ int 
dccp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); - return err; + return 0; } EXPORT_SYMBOL_GPL(dccp_disconnect); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index bdccc46a2921..c1fa4785c4c2 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -444,7 +444,7 @@ static void dn_destruct(struct sock *sk) skb_queue_purge(&scp->other_xmit_queue); skb_queue_purge(&scp->other_receive_queue); - dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); } static unsigned long dn_memory_pressure; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 0962f9201baa..cca7ae712995 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -583,8 +583,8 @@ static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!net_eq(net, &init_net)) goto errout; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + dn_ifa_policy, extack); if (err < 0) goto errout; @@ -629,8 +629,8 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + dn_ifa_policy, extack); if (err < 0) return err; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 6cd3737593a6..77fbf8e9df4b 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -42,7 +42,7 @@ #include <net/dn_fib.h> #include <net/dn_neigh.h> #include <net/dn_dev.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #define RT_MIN_TABLE 1 @@ -517,8 +517,8 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*r), attrs, RTA_MAX, + rtm_dn_policy, extack); if (err < 0) return err; @@ -544,8 +544,8 @@ static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*r), attrs, RTA_MAX, + rtm_dn_policy, extack); if (err < 0) return err; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 950613ee7881..664584763c36 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1651,8 +1651,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_dn_policy, extack); if (err < 0) return err; diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index f0710b5d037d..33fefb0aebca 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -348,7 +348,8 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, struct rtnexthop *nhp; struct nlattr *mp_head; - if (!(mp_head = nla_nest_start(skb, RTA_MULTIPATH))) + mp_head = nla_nest_start_noflag(skb, RTA_MULTIPATH); + if (!mp_head) goto errout; for_nexthops(fi) { diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 76338c38738a..19aa32fc1802 100644 --- a/net/dns_resolver/dns_query.c +++ 
b/net/dns_resolver/dns_query.c @@ -94,8 +94,6 @@ int dns_query(const char *type, const char *name, size_t namelen, desclen += typelen + 1; } - if (!namelen) - namelen = strnlen(name, 256); if (namelen < 3 || namelen > 255) return -EINVAL; desclen += namelen + 1; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index fab49132345f..cf855352a440 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -4,60 +4,117 @@ config HAVE_NET_DSA # Drivers must select NET_DSA and the appropriate tagging format -config NET_DSA +menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n select NET_SWITCHDEV select PHYLINK + select NET_DEVLINK ---help--- Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. if NET_DSA -config NET_DSA_LEGACY - bool "Support for older platform device and Device Tree registration" - default y - ---help--- - Say Y if you want to enable support for the older platform device and - deprecated Device Tree binding registration. +# tagging formats +config NET_DSA_TAG_8021Q + tristate "Tag driver for switches using custom 802.1Q VLAN headers" + select VLAN_8021Q + help + Unlike the other tagging protocols, the 802.1Q config option simply + provides helpers for other tagging implementations that might rely on + VLAN in one way or another. It is not a complete solution. - This feature is scheduled for removal in 4.17. + Drivers which use these helpers should select this as dependency. + +config NET_DSA_TAG_BRCM_COMMON + tristate + default n -# tagging formats config NET_DSA_TAG_BRCM - bool + tristate "Tag driver for Broadcom switches using in-frame headers" + select NET_DSA_TAG_BRCM_COMMON + help + Say Y if you want to enable support for tagging frames for the + Broadcom switches which place the tag after the MAC source address. + config NET_DSA_TAG_BRCM_PREPEND - bool + tristate "Tag driver for Broadcom switches using prepended headers" + select NET_DSA_TAG_BRCM_COMMON + help + Say Y if you want to enable support for tagging frames for the + Broadcom switches which place the tag before the Ethernet header + (prepended). + +config NET_DSA_TAG_GSWIP + tristate "Tag driver for Lantiq / Intel GSWIP switches" + help + Say Y or M if you want to enable support for tagging frames for the + Lantiq / Intel GSWIP switches. config NET_DSA_TAG_DSA - bool + tristate "Tag driver for Marvell switches using DSA headers" + help + Say Y or M if you want to enable support for tagging frames for the + Marvell switches which use DSA headers. config NET_DSA_TAG_EDSA - bool + tristate "Tag driver for Marvell switches using EtherType DSA headers" + help + Say Y or M if you want to enable support for tagging frames for the + Marvell switches which use EtherType DSA headers. -config NET_DSA_TAG_GSWIP - bool +config NET_DSA_TAG_MTK + tristate "Tag driver for Mediatek switches" + help + Say Y or M if you want to enable support for tagging frames for + Mediatek switches. + +config NET_DSA_TAG_KSZ_COMMON + tristate + default n config NET_DSA_TAG_KSZ - bool + tristate "Tag driver for Microchip 9893 family of switches" + select NET_DSA_TAG_KSZ_COMMON + help + Say Y if you want to enable support for tagging frames for the + Microchip 9893 family of switches.
config NET_DSA_TAG_KSZ9477 - bool - select NET_DSA_TAG_KSZ + tristate "Tag driver for Microchip 9477 family of switches" + select NET_DSA_TAG_KSZ_COMMON + help + Say Y if you want to enable support for tagging frames for the + Microchip 9477 family of switches. -config NET_DSA_TAG_LAN9303 - bool +config NET_DSA_TAG_QCA + tristate "Tag driver for Qualcomm Atheros QCA8K switches" + help + Say Y or M if you want to enable support for tagging frames for + the Qualcomm Atheros QCA8K switches. -config NET_DSA_TAG_MTK - bool +config NET_DSA_TAG_LAN9303 + tristate "Tag driver for SMSC/Microchip LAN9303 family of switches" + help + Say Y or M if you want to enable support for tagging frames for the + SMSC/Microchip LAN9303 family of switches. + +config NET_DSA_TAG_SJA1105 + tristate "Tag driver for NXP SJA1105 switches" + select NET_DSA_TAG_8021Q + help + Say Y or M if you want to enable support for tagging frames with the + NXP SJA1105 switch family. Both the native tagging protocol (which + is only for link-local traffic) as well as non-native tagging (based + on a custom 802.1Q VLAN header) are available. config NET_DSA_TAG_TRAILER - bool - -config NET_DSA_TAG_QCA - bool + tristate "Tag driver for switches using a trailer tag" + help + Say Y or M if you want to enable support for tagging frames + with a trailer tag, e.g. Marvell 88E6060. endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 6e721f7a2947..c342f54715ba 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -2,16 +2,16 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o -dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o # tagging formats -dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o -dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o -dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o -dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o -dsa_core-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o -dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o -dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o -dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o -dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o -dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o +obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o +obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o +obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o +obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o +obj-$(CONFIG_NET_DSA_TAG_KSZ_COMMON) += tag_ksz.o +obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o +obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o +obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o +obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 36de4f2a3366..1fc782fab393 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -27,6 +27,9 @@ #include "dsa_priv.h" +static LIST_HEAD(dsa_tag_drivers_list); +static DEFINE_MUTEX(dsa_tag_drivers_lock); + static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -35,106 +38,103 @@ static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, } static const struct dsa_device_ops none_ops = { + .name = "none", + .proto = DSA_TAG_PROTO_NONE, .xmit = dsa_slave_notag_xmit, .rcv = NULL, }; -const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { -#ifdef CONFIG_NET_DSA_TAG_BRCM - [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND - [DSA_TAG_PROTO_BRCM_PREPEND] =
&brcm_prepend_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_GSWIP - [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_KSZ9477 - [DSA_TAG_PROTO_KSZ9477] = &ksz9477_netdev_ops, - [DSA_TAG_PROTO_KSZ9893] = &ksz9893_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_LAN9303 - [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_MTK - [DSA_TAG_PROTO_MTK] = &mtk_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_QCA - [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops, -#endif - [DSA_TAG_PROTO_NONE] = &none_ops, -}; +DSA_TAG_DRIVER(none_ops); -const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) +static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, + struct module *owner) +{ + dsa_tag_driver->owner = owner; + + mutex_lock(&dsa_tag_drivers_lock); + list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list); + mutex_unlock(&dsa_tag_drivers_lock); +} + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, struct module *owner) { - const char *protocol_name[DSA_TAG_LAST] = { -#ifdef CONFIG_NET_DSA_TAG_BRCM - [DSA_TAG_PROTO_BRCM] = "brcm", -#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND - [DSA_TAG_PROTO_BRCM_PREPEND] = "brcm-prepend", -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - [DSA_TAG_PROTO_DSA] = "dsa", -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - [DSA_TAG_PROTO_EDSA] = "edsa", -#endif -#ifdef CONFIG_NET_DSA_TAG_GSWIP - [DSA_TAG_PROTO_GSWIP] = "gswip", -#endif -#ifdef CONFIG_NET_DSA_TAG_KSZ9477 - [DSA_TAG_PROTO_KSZ9477] = "ksz9477", - [DSA_TAG_PROTO_KSZ9893] = "ksz9893", -#endif -#ifdef CONFIG_NET_DSA_TAG_LAN9303 - [DSA_TAG_PROTO_LAN9303] = "lan9303", -#endif -#ifdef CONFIG_NET_DSA_TAG_MTK - [DSA_TAG_PROTO_MTK] = "mtk", -#endif -#ifdef CONFIG_NET_DSA_TAG_QCA - [DSA_TAG_PROTO_QCA] = "qca", -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - [DSA_TAG_PROTO_TRAILER] = "trailer", -#endif - [DSA_TAG_PROTO_NONE] = "none", - }; unsigned int i; - BUILD_BUG_ON(ARRAY_SIZE(protocol_name) != DSA_TAG_LAST); + for (i = 0; i < count; i++) + dsa_tag_driver_register(dsa_tag_driver_array[i], owner); +} - for (i = 0; i < ARRAY_SIZE(dsa_device_ops); i++) - if (ops == dsa_device_ops[i]) - return protocol_name[i]; +static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver) +{ + mutex_lock(&dsa_tag_drivers_lock); + list_del(&dsa_tag_driver->list); + mutex_unlock(&dsa_tag_drivers_lock); +} +EXPORT_SYMBOL_GPL(dsa_tag_drivers_register); - return protocol_name[DSA_TAG_PROTO_NONE]; +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count) +{ + unsigned int i; + + for (i = 0; i < count; i++) + dsa_tag_driver_unregister(dsa_tag_driver_array[i]); +} +EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister); + +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) +{ + return ops->name; }; -const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) +const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol) { + struct dsa_tag_driver *dsa_tag_driver; const struct dsa_device_ops *ops; + char module_name[128]; + bool found = false; + + snprintf(module_name, 127, "%s%d", DSA_TAG_DRIVER_ALIAS, + tag_protocol); + + request_module(module_name); + + mutex_lock(&dsa_tag_drivers_lock); + 
list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + ops = dsa_tag_driver->ops; + if (ops->proto == tag_protocol) { + found = true; + break; + } + } - if (tag_protocol >= DSA_TAG_LAST) - return ERR_PTR(-EINVAL); - ops = dsa_device_ops[tag_protocol]; + if (found) { + if (!try_module_get(dsa_tag_driver->owner)) + ops = ERR_PTR(-ENOPROTOOPT); + } else { + ops = ERR_PTR(-ENOPROTOOPT); + } - if (!ops) - return ERR_PTR(-ENOPROTOOPT); + mutex_unlock(&dsa_tag_drivers_lock); return ops; } +void dsa_tag_driver_put(const struct dsa_device_ops *ops) +{ + struct dsa_tag_driver *dsa_tag_driver; + + mutex_lock(&dsa_tag_drivers_lock); + list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + if (dsa_tag_driver->ops == ops) { + module_put(dsa_tag_driver->owner); + break; + } + } + mutex_unlock(&dsa_tag_drivers_lock); +} + static int dev_is_class(struct device *dev, void *class) { if (dev->class != NULL && !strcmp(dev->class->name, class)) @@ -344,23 +344,28 @@ static int __init dsa_init_module(void) rc = dsa_slave_register_notifier(); if (rc) - return rc; - - rc = dsa_legacy_register(); - if (rc) - return rc; + goto register_notifier_fail; dev_add_pack(&dsa_pack_type); + dsa_tag_driver_register(&DSA_TAG_DRIVER_NAME(none_ops), + THIS_MODULE); + return 0; + +register_notifier_fail: + destroy_workqueue(dsa_owq); + + return rc; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { + dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); + dsa_slave_unregister_notifier(); dev_remove_pack(&dsa_pack_type); - dsa_legacy_unregister(); destroy_workqueue(dsa_owq); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c00ee464afc7..3b5f434cad3f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -18,6 +18,7 @@ #include <linux/rtnetlink.h> #include <linux/of.h> #include <linux/of_net.h> +#include <net/devlink.h> #include "dsa_priv.h" @@ -257,14 +258,39 @@ static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) static int dsa_port_setup(struct dsa_port *dp) { + enum devlink_port_flavour flavour; struct dsa_switch *ds = dp->ds; - int err = 0; + struct dsa_switch_tree *dst = ds->dst; + int err; + + if (dp->type == DSA_PORT_TYPE_UNUSED) + return 0; memset(&dp->devlink_port, 0, sizeof(dp->devlink_port)); + dp->mac = of_get_mac_address(dp->dn); - if (dp->type != DSA_PORT_TYPE_UNUSED) - err = devlink_port_register(ds->devlink, &dp->devlink_port, - dp->index); + switch (dp->type) { + case DSA_PORT_TYPE_CPU: + flavour = DEVLINK_PORT_FLAVOUR_CPU; + break; + case DSA_PORT_TYPE_DSA: + flavour = DEVLINK_PORT_FLAVOUR_DSA; + break; + case DSA_PORT_TYPE_USER: /* fall-through */ + default: + flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + break; + } + + /* dp->index is used now as port_number. However + * CPU and DSA ports should have separate numbering + * independent from front panel port numbers. + */ + devlink_port_attrs_set(&dp->devlink_port, flavour, + dp->index, false, 0, + (const char *) &dst->index, sizeof(dst->index)); + err = devlink_port_register(ds->devlink, &dp->devlink_port, + dp->index); if (err) return err; @@ -272,13 +298,6 @@ static int dsa_port_setup(struct dsa_port *dp) case DSA_PORT_TYPE_UNUSED: break; case DSA_PORT_TYPE_CPU: - /* dp->index is used now as port_number. However - * CPU ports should have separate numbering - * independent from front panel port numbers. 
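The two functions above replace the old compile-time dsa_device_ops array with a mutex-protected registry keyed by tagging protocol. As a sketch of the consumer side - mirroring what dsa_port_parse_cpu() does further down, with example_bind_tagger() being a hypothetical caller rather than part of this series:

static int example_bind_tagger(struct dsa_port *cpu_dp, int tag_protocol)
{
	const struct dsa_device_ops *ops;

	/* May request_module() the matching tagger before walking
	 * dsa_tag_drivers_list; takes a module reference on success.
	 */
	ops = dsa_tag_driver_get(tag_protocol);
	if (IS_ERR(ops))
		return PTR_ERR(ops);	/* -ENOPROTOOPT: no tagger found */

	cpu_dp->tag_ops = ops;
	cpu_dp->rcv = ops->rcv;		/* copied for the master RX hot path */
	return 0;
}

The reference is dropped again with dsa_tag_driver_put(), which dsa_port_teardown() now does for CPU ports.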
- */ - devlink_port_attrs_set(&dp->devlink_port, - DEVLINK_PORT_FLAVOUR_CPU, - dp->index, false, 0); err = dsa_port_link_register_of(dp); if (err) { dev_err(ds->dev, "failed to setup link for port %d.%d\n", @@ -287,13 +306,6 @@ static int dsa_port_setup(struct dsa_port *dp) } break; case DSA_PORT_TYPE_DSA: - /* dp->index is used now as port_number. However - * DSA ports should have separate numbering - * independent from front panel port numbers. - */ - devlink_port_attrs_set(&dp->devlink_port, - DEVLINK_PORT_FLAVOUR_DSA, - dp->index, false, 0); err = dsa_port_link_register_of(dp); if (err) { dev_err(ds->dev, "failed to setup link for port %d.%d\n", @@ -302,9 +314,6 @@ static int dsa_port_setup(struct dsa_port *dp) } break; case DSA_PORT_TYPE_USER: - devlink_port_attrs_set(&dp->devlink_port, - DEVLINK_PORT_FLAVOUR_PHYSICAL, - dp->index, false, 0); err = dsa_slave_create(dp); if (err) dev_err(ds->dev, "failed to create slave for port %d.%d\n", @@ -326,6 +335,8 @@ static void dsa_port_teardown(struct dsa_port *dp) case DSA_PORT_TYPE_UNUSED: break; case DSA_PORT_TYPE_CPU: + dsa_tag_driver_put(dp->tag_ops); + /* fall-through */ case DSA_PORT_TYPE_DSA: dsa_port_link_unregister_of(dp); break; @@ -360,14 +371,14 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err) return err; - err = ds->ops->setup(ds); - if (err < 0) - return err; - err = dsa_switch_register_notifier(ds); if (err) return err; + err = ds->ops->setup(ds); + if (err < 0) + return err; + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) @@ -568,13 +579,14 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) enum dsa_tag_protocol tag_protocol; tag_protocol = ds->ops->get_tag_protocol(ds, dp->index); - tag_ops = dsa_resolve_tag_protocol(tag_protocol); + tag_ops = dsa_tag_driver_get(tag_protocol); if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); return PTR_ERR(tag_ops); } dp->type = DSA_PORT_TYPE_CPU; + dp->filter = tag_ops->filter; dp->rcv = tag_ops->rcv; dp->tag_ops = tag_ops; dp->master = master; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 093b7d145eb1..8f1222324646 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -84,22 +84,12 @@ struct dsa_slave_priv { }; /* dsa.c */ -const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); +const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol); +void dsa_tag_driver_put(const struct dsa_device_ops *ops); + bool dsa_schedule_work(struct work_struct *work); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); -/* legacy.c */ -#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) -int dsa_legacy_register(void); -void dsa_legacy_unregister(void); -#else -static inline int dsa_legacy_register(void) -{ - return 0; -} - -static inline void dsa_legacy_unregister(void) { } -#endif int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, @@ -169,6 +159,8 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct switchdev_trans *trans); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); +int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags); +int dsa_port_vid_del(struct dsa_port *dp, u16 vid); int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); @@ -182,6 +174,8 @@ int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void 
dsa_slave_unregister_notifier(void); +void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev); + static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -200,34 +194,4 @@ dsa_slave_to_master(const struct net_device *dev) /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); - -/* tag_brcm.c */ -extern const struct dsa_device_ops brcm_netdev_ops; -extern const struct dsa_device_ops brcm_prepend_netdev_ops; - -/* tag_dsa.c */ -extern const struct dsa_device_ops dsa_netdev_ops; - -/* tag_edsa.c */ -extern const struct dsa_device_ops edsa_netdev_ops; - -/* tag_gswip.c */ -extern const struct dsa_device_ops gswip_netdev_ops; - -/* tag_ksz.c */ -extern const struct dsa_device_ops ksz9477_netdev_ops; -extern const struct dsa_device_ops ksz9893_netdev_ops; - -/* tag_lan9303.c */ -extern const struct dsa_device_ops lan9303_netdev_ops; - -/* tag_mtk.c */ -extern const struct dsa_device_ops mtk_netdev_ops; - -/* tag_qca.c */ -extern const struct dsa_device_ops qca_netdev_ops; - -/* tag_trailer.c */ -extern const struct dsa_device_ops trailer_netdev_ops; - #endif diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c deleted file mode 100644 index cb42939db776..000000000000 --- a/net/dsa/legacy.c +++ /dev/null @@ -1,745 +0,0 @@ -/* - * net/dsa/legacy.c - Hardware switch handling - * Copyright (c) 2008-2009 Marvell Semiconductor - * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include <linux/device.h> -#include <linux/list.h> -#include <linux/platform_device.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/of.h> -#include <linux/of_mdio.h> -#include <linux/of_platform.h> -#include <linux/of_net.h> -#include <linux/netdevice.h> -#include <linux/sysfs.h> -#include <linux/phy_fixed.h> -#include <linux/etherdevice.h> - -#include "dsa_priv.h" - -/* switch driver registration ***********************************************/ -static DEFINE_MUTEX(dsa_switch_drivers_mutex); -static LIST_HEAD(dsa_switch_drivers); - -void register_switch_driver(struct dsa_switch_driver *drv) -{ - mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&drv->list, &dsa_switch_drivers); - mutex_unlock(&dsa_switch_drivers_mutex); -} -EXPORT_SYMBOL_GPL(register_switch_driver); - -void unregister_switch_driver(struct dsa_switch_driver *drv) -{ - mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&drv->list); - mutex_unlock(&dsa_switch_drivers_mutex); -} -EXPORT_SYMBOL_GPL(unregister_switch_driver); - -static const struct dsa_switch_ops * -dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, - const char **_name, void **priv) -{ - const struct dsa_switch_ops *ret; - struct list_head *list; - const char *name; - - ret = NULL; - name = NULL; - - mutex_lock(&dsa_switch_drivers_mutex); - list_for_each(list, &dsa_switch_drivers) { - const struct dsa_switch_ops *ops; - struct dsa_switch_driver *drv; - - drv = list_entry(list, struct dsa_switch_driver, list); - ops = drv->ops; - - name = ops->probe(parent, host_dev, sw_addr, priv); - if (name != NULL) { - ret = ops; - break; - } - } - mutex_unlock(&dsa_switch_drivers_mutex); - - *_name = name; - - return ret; -} - -/* basic switch operations **************************************************/ -static int dsa_cpu_dsa_setups(struct dsa_switch *ds) -{ - int ret, port; - - for (port = 0; port < ds->num_ports; port++) { - if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) - continue; - - ret = dsa_port_link_register_of(&ds->ports[port]); - if (ret) - return ret; - } - return 0; -} - -static int dsa_switch_setup_one(struct dsa_switch *ds, - struct net_device *master) -{ - const struct dsa_switch_ops *ops = ds->ops; - struct dsa_switch_tree *dst = ds->dst; - struct dsa_chip_data *cd = ds->cd; - bool valid_name_found = false; - int index = ds->index; - struct dsa_port *dp; - int i, ret; - - /* - * Validate supplied switch configuration. - */ - for (i = 0; i < ds->num_ports; i++) { - char *name; - - dp = &ds->ports[i]; - - name = cd->port_names[i]; - if (name == NULL) - continue; - dp->name = name; - - if (!strcmp(name, "cpu")) { - if (dst->cpu_dp) { - netdev_err(master, - "multiple cpu ports?!\n"); - return -EINVAL; - } - dst->cpu_dp = &ds->ports[i]; - dst->cpu_dp->master = master; - dp->type = DSA_PORT_TYPE_CPU; - } else if (!strcmp(name, "dsa")) { - dp->type = DSA_PORT_TYPE_DSA; - } else { - dp->type = DSA_PORT_TYPE_USER; - } - valid_name_found = true; - } - - if (!valid_name_found && i == ds->num_ports) - return -EINVAL; - - /* Make the built-in MII bus mask match the number of ports, - * switch drivers can override this later - */ - ds->phys_mii_mask |= dsa_user_ports(ds); - - /* - * If the CPU connects to this switch, set the switch tree - * tagging protocol to the preferred tagging format of this - * switch. 
- */ - if (dst->cpu_dp->ds == ds) { - const struct dsa_device_ops *tag_ops; - enum dsa_tag_protocol tag_protocol; - - tag_protocol = ops->get_tag_protocol(ds, dst->cpu_dp->index); - tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(tag_ops)) - return PTR_ERR(tag_ops); - - dst->cpu_dp->tag_ops = tag_ops; - - /* Few copies for faster access in master receive hot path */ - dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->cpu_dp->dst = dst; - } - - memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); - - /* - * Do basic register setup. - */ - ret = ops->setup(ds); - if (ret < 0) - return ret; - - ret = dsa_switch_register_notifier(ds); - if (ret) - return ret; - - if (!ds->slave_mii_bus && ops->phy_read) { - ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); - if (!ds->slave_mii_bus) - return -ENOMEM; - dsa_slave_mii_bus_init(ds); - - ret = mdiobus_register(ds->slave_mii_bus); - if (ret < 0) - return ret; - } - - /* - * Create network devices for physical switch ports. - */ - for (i = 0; i < ds->num_ports; i++) { - ds->ports[i].dn = cd->port_dn[i]; - ds->ports[i].cpu_dp = dst->cpu_dp; - - if (!dsa_is_user_port(ds, i)) - continue; - - ret = dsa_slave_create(&ds->ports[i]); - if (ret < 0) - netdev_err(master, "[%d]: can't create dsa slave device for port %d(%s): %d\n", - index, i, cd->port_names[i], ret); - } - - /* Perform configuration of the CPU and DSA ports */ - ret = dsa_cpu_dsa_setups(ds); - if (ret < 0) - netdev_err(master, "[%d] : can't configure CPU and DSA ports\n", - index); - - return 0; -} - -static struct dsa_switch * -dsa_switch_setup(struct dsa_switch_tree *dst, struct net_device *master, - int index, struct device *parent, struct device *host_dev) -{ - struct dsa_chip_data *cd = dst->pd->chip + index; - const struct dsa_switch_ops *ops; - struct dsa_switch *ds; - int ret; - const char *name; - void *priv; - - /* - * Probe for switch model. - */ - ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); - if (!ops) { - netdev_err(master, "[%d]: could not detect attached switch\n", - index); - return ERR_PTR(-EINVAL); - } - netdev_info(master, "[%d]: detected a %s switch\n", - index, name); - - - /* - * Allocate and initialise switch state. - */ - ds = dsa_switch_alloc(parent, DSA_MAX_PORTS); - if (!ds) - return ERR_PTR(-ENOMEM); - - ds->dst = dst; - ds->index = index; - ds->cd = cd; - ds->ops = ops; - ds->priv = priv; - - ret = dsa_switch_setup_one(ds, master); - if (ret) - return ERR_PTR(ret); - - return ds; -} - -static void dsa_switch_destroy(struct dsa_switch *ds) -{ - int port; - - /* Destroy network devices for physical switch ports. 
*/ - for (port = 0; port < ds->num_ports; port++) { - if (!dsa_is_user_port(ds, port)) - continue; - - if (!ds->ports[port].slave) - continue; - - dsa_slave_destroy(ds->ports[port].slave); - } - - /* Disable configuration of the CPU and DSA ports */ - for (port = 0; port < ds->num_ports; port++) { - if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) - continue; - dsa_port_link_unregister_of(&ds->ports[port]); - } - - if (ds->slave_mii_bus && ds->ops->phy_read) - mdiobus_unregister(ds->slave_mii_bus); - - dsa_switch_unregister_notifier(ds); -} - -/* platform driver init and cleanup *****************************************/ -static int dev_is_class(struct device *dev, void *class) -{ - if (dev->class != NULL && !strcmp(dev->class->name, class)) - return 1; - - return 0; -} - -static struct device *dev_find_class(struct device *parent, char *class) -{ - if (dev_is_class(parent, class)) { - get_device(parent); - return parent; - } - - return device_find_child(parent, class, dev_is_class); -} - -struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) -{ - struct device *d; - - d = dev_find_class(dev, "mdio_bus"); - if (d != NULL) { - struct mii_bus *bus; - - bus = to_mii_bus(d); - put_device(d); - - return bus; - } - - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); - -#ifdef CONFIG_OF -static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, - struct dsa_chip_data *cd, - int chip_index, int port_index, - struct device_node *link) -{ - const __be32 *reg; - int link_sw_addr; - struct device_node *parent_sw; - int len; - - parent_sw = of_get_parent(link); - if (!parent_sw) - return -EINVAL; - - reg = of_get_property(parent_sw, "reg", &len); - if (!reg || (len != sizeof(*reg) * 2)) - return -EINVAL; - - /* - * Get the destination switch number from the second field of its 'reg' - * property, i.e. for "reg = <0x19 1>" sw_addr is '1'. 
- */ - link_sw_addr = be32_to_cpup(reg + 1); - - if (link_sw_addr >= pd->nr_chips) - return -EINVAL; - - cd->rtable[link_sw_addr] = port_index; - - return 0; -} - -static int dsa_of_probe_links(struct dsa_platform_data *pd, - struct dsa_chip_data *cd, - int chip_index, int port_index, - struct device_node *port, - const char *port_name) -{ - struct device_node *link; - int link_index; - int ret; - - for (link_index = 0;; link_index++) { - link = of_parse_phandle(port, "link", link_index); - if (!link) - break; - - if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) { - ret = dsa_of_setup_routing_table(pd, cd, chip_index, - port_index, link); - if (ret) - return ret; - } - } - return 0; -} - -static void dsa_of_free_platform_data(struct dsa_platform_data *pd) -{ - int i; - int port_index; - - for (i = 0; i < pd->nr_chips; i++) { - port_index = 0; - while (port_index < DSA_MAX_PORTS) { - kfree(pd->chip[i].port_names[port_index]); - port_index++; - } - - /* Drop our reference to the MDIO bus device */ - put_device(pd->chip[i].host_dev); - } - kfree(pd->chip); -} - -static int dsa_of_probe(struct device *dev) -{ - struct device_node *np = dev->of_node; - struct device_node *child, *mdio, *ethernet, *port; - struct mii_bus *mdio_bus, *mdio_bus_switch; - struct net_device *ethernet_dev; - struct dsa_platform_data *pd; - struct dsa_chip_data *cd; - const char *port_name; - int chip_index, port_index; - const unsigned int *sw_addr, *port_reg; - u32 eeprom_len; - int ret; - - mdio = of_parse_phandle(np, "dsa,mii-bus", 0); - if (!mdio) - return -EINVAL; - - mdio_bus = of_mdio_find_bus(mdio); - if (!mdio_bus) - return -EPROBE_DEFER; - - ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) { - ret = -EINVAL; - goto out_put_mdio; - } - - ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) { - ret = -EPROBE_DEFER; - goto out_put_mdio; - } - - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ret = -ENOMEM; - goto out_put_ethernet; - } - - dev->platform_data = pd; - pd->of_netdev = ethernet_dev; - pd->nr_chips = of_get_available_child_count(np); - if (pd->nr_chips > DSA_MAX_SWITCHES) - pd->nr_chips = DSA_MAX_SWITCHES; - - pd->chip = kcalloc(pd->nr_chips, sizeof(struct dsa_chip_data), - GFP_KERNEL); - if (!pd->chip) { - ret = -ENOMEM; - goto out_free; - } - - chip_index = -1; - for_each_available_child_of_node(np, child) { - int i; - - chip_index++; - cd = &pd->chip[chip_index]; - - cd->of_node = child; - - /* Initialize the routing table */ - for (i = 0; i < DSA_MAX_SWITCHES; ++i) - cd->rtable[i] = DSA_RTABLE_NONE; - - /* When assigning the host device, increment its refcount */ - cd->host_dev = get_device(&mdio_bus->dev); - - sw_addr = of_get_property(child, "reg", NULL); - if (!sw_addr) - continue; - - cd->sw_addr = be32_to_cpup(sw_addr); - if (cd->sw_addr >= PHY_MAX_ADDR) - continue; - - if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) - cd->eeprom_len = eeprom_len; - - mdio = of_parse_phandle(child, "mii-bus", 0); - if (mdio) { - mdio_bus_switch = of_mdio_find_bus(mdio); - if (!mdio_bus_switch) { - ret = -EPROBE_DEFER; - goto out_free_chip; - } - - /* Drop the mdio_bus device ref, replacing the host - * device with the mdio_bus_switch device, keeping - * the refcount from of_mdio_find_bus() above. 
- */ - put_device(cd->host_dev); - cd->host_dev = &mdio_bus_switch->dev; - } - - for_each_available_child_of_node(child, port) { - port_reg = of_get_property(port, "reg", NULL); - if (!port_reg) - continue; - - port_index = be32_to_cpup(port_reg); - if (port_index >= DSA_MAX_PORTS) - break; - - port_name = of_get_property(port, "label", NULL); - if (!port_name) - continue; - - cd->port_dn[port_index] = port; - - cd->port_names[port_index] = kstrdup(port_name, - GFP_KERNEL); - if (!cd->port_names[port_index]) { - ret = -ENOMEM; - goto out_free_chip; - } - - ret = dsa_of_probe_links(pd, cd, chip_index, - port_index, port, port_name); - if (ret) - goto out_free_chip; - - } - } - - /* The individual chips hold their own refcount on the mdio bus, - * so drop ours */ - put_device(&mdio_bus->dev); - - return 0; - -out_free_chip: - dsa_of_free_platform_data(pd); -out_free: - kfree(pd); - dev->platform_data = NULL; -out_put_ethernet: - put_device(&ethernet_dev->dev); -out_put_mdio: - put_device(&mdio_bus->dev); - return ret; -} - -static void dsa_of_remove(struct device *dev) -{ - struct dsa_platform_data *pd = dev->platform_data; - - if (!dev->of_node) - return; - - dsa_of_free_platform_data(pd); - put_device(&pd->of_netdev->dev); - kfree(pd); -} -#else -static inline int dsa_of_probe(struct device *dev) -{ - return 0; -} - -static inline void dsa_of_remove(struct device *dev) -{ -} -#endif - -static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, - struct device *parent, struct dsa_platform_data *pd) -{ - int i; - unsigned configured = 0; - - dst->pd = pd; - - for (i = 0; i < pd->nr_chips; i++) { - struct dsa_switch *ds; - - ds = dsa_switch_setup(dst, dev, i, parent, pd->chip[i].host_dev); - if (IS_ERR(ds)) { - netdev_err(dev, "[%d]: couldn't create dsa switch instance (error %ld)\n", - i, PTR_ERR(ds)); - continue; - } - - dst->ds[i] = ds; - - ++configured; - } - - /* - * If no switch was found, exit cleanly - */ - if (!configured) - return -EPROBE_DEFER; - - return dsa_master_setup(dst->cpu_dp->master, dst->cpu_dp); -} - -static int dsa_probe(struct platform_device *pdev) -{ - struct dsa_platform_data *pd = pdev->dev.platform_data; - struct net_device *dev; - struct dsa_switch_tree *dst; - int ret; - - if (pdev->dev.of_node) { - ret = dsa_of_probe(&pdev->dev); - if (ret) - return ret; - - pd = pdev->dev.platform_data; - } - - if (pd == NULL || (pd->netdev == NULL && pd->of_netdev == NULL)) - return -EINVAL; - - if (pd->of_netdev) { - dev = pd->of_netdev; - dev_hold(dev); - } else { - dev = dsa_dev_to_net_device(pd->netdev); - } - if (dev == NULL) { - ret = -EPROBE_DEFER; - goto out; - } - - if (dev->dsa_ptr != NULL) { - dev_put(dev); - ret = -EEXIST; - goto out; - } - - dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL); - if (dst == NULL) { - dev_put(dev); - ret = -ENOMEM; - goto out; - } - - platform_set_drvdata(pdev, dst); - - ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); - if (ret) { - dev_put(dev); - goto out; - } - - return 0; - -out: - dsa_of_remove(&pdev->dev); - - return ret; -} - -static void dsa_remove_dst(struct dsa_switch_tree *dst) -{ - int i; - - dsa_master_teardown(dst->cpu_dp->master); - - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; - - if (ds) - dsa_switch_destroy(ds); - } - - dev_put(dst->cpu_dp->master); -} - -static int dsa_remove(struct platform_device *pdev) -{ - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); - - dsa_remove_dst(dst); - dsa_of_remove(&pdev->dev); - - return 0; -} - -static void
dsa_shutdown(struct platform_device *pdev) -{ -} - -#ifdef CONFIG_PM_SLEEP -static int dsa_suspend(struct device *d) -{ - struct dsa_switch_tree *dst = dev_get_drvdata(d); - int i, ret = 0; - - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; - - if (ds != NULL) - ret = dsa_switch_suspend(ds); - } - - return ret; -} - -static int dsa_resume(struct device *d) -{ - struct dsa_switch_tree *dst = dev_get_drvdata(d); - int i, ret = 0; - - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; - - if (ds != NULL) - ret = dsa_switch_resume(ds); - } - - return ret; -} -#endif - -static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); - -static const struct of_device_id dsa_of_match_table[] = { - { .compatible = "marvell,dsa", }, - {} -}; -MODULE_DEVICE_TABLE(of, dsa_of_match_table); - -static struct platform_driver dsa_driver = { - .probe = dsa_probe, - .remove = dsa_remove, - .shutdown = dsa_shutdown, - .driver = { - .name = "dsa", - .of_match_table = dsa_of_match_table, - .pm = &dsa_pm_ops, - }, -}; - -int dsa_legacy_register(void) -{ - return platform_driver_register(&dsa_driver); -} - -void dsa_legacy_unregister(void) -{ - platform_driver_unregister(&dsa_driver); -} diff --git a/net/dsa/port.c b/net/dsa/port.c index caeef4c99dc0..ed8ba9daa3ba 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -154,19 +154,67 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) dsa_port_set_state_now(dp, BR_STATE_FORWARDING); } +static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, + bool vlan_filtering) +{ + struct dsa_switch *ds = dp->ds; + int i; + + if (!ds->vlan_filtering_is_global) + return true; + + /* For cases where enabling/disabling VLAN awareness is global to the + * switch, we need to handle the case where multiple bridges span + * different ports of the same switch device and one of them has a + * different setting than what is being requested. 
+ */ + for (i = 0; i < ds->num_ports; i++) { + struct net_device *other_bridge; + + other_bridge = dsa_to_port(ds, i)->bridge_dev; + if (!other_bridge) + continue; + /* If it's the same bridge, it also has same + * vlan_filtering setting => no need to check + */ + if (other_bridge == dp->bridge_dev) + continue; + if (br_vlan_enabled(other_bridge) != vlan_filtering) { + dev_err(ds->dev, "VLAN filtering is a global setting\n"); + return false; + } + } + return true; +} + int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans) { struct dsa_switch *ds = dp->ds; + int err; /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ if (switchdev_trans_ph_prepare(trans)) return 0; - if (ds->ops->port_vlan_filtering) - return ds->ops->port_vlan_filtering(ds, dp->index, - vlan_filtering); + if (!ds->ops->port_vlan_filtering) + return 0; + + if (!dsa_port_can_apply_vlan_filtering(dp, vlan_filtering)) + return -EINVAL; + + if (dsa_port_is_vlan_filtering(dp) == vlan_filtering) + return 0; + + err = ds->ops->port_vlan_filtering(ds, dp->index, + vlan_filtering); + if (err) + return err; + if (ds->vlan_filtering_is_global) + ds->vlan_filtering = vlan_filtering; + else + dp->vlan_filtering = vlan_filtering; return 0; } @@ -322,6 +370,39 @@ int dsa_port_vlan_del(struct dsa_port *dp, return 0; } +int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags) +{ + struct switchdev_obj_port_vlan vlan = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = flags, + .vid_begin = vid, + .vid_end = vid, + }; + struct switchdev_trans trans; + int err; + + trans.ph_prepare = true; + err = dsa_port_vlan_add(dp, &vlan, &trans); + if (err == -EOPNOTSUPP) + return 0; + + trans.ph_prepare = false; + return dsa_port_vlan_add(dp, &vlan, &trans); +} +EXPORT_SYMBOL(dsa_port_vid_add); + +int dsa_port_vid_del(struct dsa_port *dp, u16 vid) +{ + struct switchdev_obj_port_vlan vlan = { + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .vid_begin = vid, + .vid_end = vid, + }; + + return dsa_port_vlan_del(dp, &vlan); +} +EXPORT_SYMBOL(dsa_port_vid_del); + static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) { struct device_node *phy_dn; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 093eef6f2599..9892ca1f6859 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -120,6 +120,9 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); + cancel_work_sync(&dp->xmit_work); + skb_queue_purge(&dp->xmit_queue); + phylink_stop(dp->pl); dsa_port_disable(dp); @@ -379,6 +382,13 @@ static int dsa_slave_get_port_parent_id(struct net_device *dev, struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; + /* For non-legacy ports, devlink is used and it takes + * care of the name generation. This ndo implementation + * should be removed with legacy support. 
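Backing up to the port.c hunk above: dsa_port_vlan_filtering() now stores the result either per chip (ds->vlan_filtering) or per port (dp->vlan_filtering). The dsa_port_is_vlan_filtering() accessor that readers such as the sja1105 tagger below rely on is not part of this diff (it presumably sits with the other inline helpers in include/net/dsa.h); a minimal sketch consistent with the stores above:

static inline bool example_port_is_vlan_filtering(const struct dsa_port *dp)
{
	struct dsa_switch *ds = dp->ds;

	if (ds->vlan_filtering_is_global)
		return ds->vlan_filtering;	/* one knob per chip */

	return dp->vlan_filtering;		/* one knob per port */
}

Callers thus ask a single question without caring whether the hardware knob is global or per port.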
+ */ + if (dp->ds->devlink) + return -EOPNOTSUPP; + ppid->id_len = sizeof(dst->index); memcpy(&ppid->id, &dst->index, ppid->id_len); @@ -423,6 +433,24 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, kfree_skb(clone); } +netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) +{ + /* SKB for netpoll still need to be mangled with the protocol-specific + * tag to be successfully transmitted + */ + if (unlikely(netpoll_tx_running(dev))) + return dsa_slave_netpoll_send_skb(dev, skb); + + /* Queue the SKB for transmission on the parent interface, but + * do not modify its EtherType + */ + skb->dev = dsa_slave_to_master(dev); + dev_queue_xmit(skb); + + return NETDEV_TX_OK; +} +EXPORT_SYMBOL_GPL(dsa_enqueue_skb); + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -435,6 +463,8 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); + DSA_SKB_CB(skb)->deferred_xmit = false; + /* Identify PTP protocol packets, clone them, and pass them to the * switch driver */ @@ -445,23 +475,37 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) */ nskb = p->xmit(skb, dev); if (!nskb) { - kfree_skb(skb); + if (!DSA_SKB_CB(skb)->deferred_xmit) + kfree_skb(skb); return NETDEV_TX_OK; } - /* SKB for netpoll still need to be mangled with the protocol-specific - * tag to be successfully transmitted - */ - if (unlikely(netpoll_tx_running(dev))) - return dsa_slave_netpoll_send_skb(dev, nskb); + return dsa_enqueue_skb(nskb, dev); +} - /* Queue the SKB for transmission on the parent interface, but - * do not modify its EtherType - */ - nskb->dev = dsa_slave_to_master(dev); - dev_queue_xmit(nskb); +void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); - return NETDEV_TX_OK; + DSA_SKB_CB(skb)->deferred_xmit = true; + + skb_queue_tail(&dp->xmit_queue, skb); + schedule_work(&dp->xmit_work); + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_defer_xmit); + +static void dsa_port_xmit_work(struct work_struct *work) +{ + struct dsa_port *dp = container_of(work, struct dsa_port, xmit_work); + struct dsa_switch *ds = dp->ds; + struct sk_buff *skb; + + if (unlikely(!ds->ops->port_deferred_xmit)) + return; + + while ((skb = skb_dequeue(&dp->xmit_queue)) != NULL) + ds->ops->port_deferred_xmit(ds, dp->index, skb); } /* ethtool operations *******************************************************/ @@ -736,6 +780,13 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev, { struct dsa_port *dp = dsa_slave_to_port(dev); + /* For non-legacy ports, devlink is used and it takes + * care of the name generation. This ndo implementation + * should be removed with legacy support. 
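The deferred-xmit machinery above is the generic half of a two-part mechanism: the tagger calls dsa_defer_xmit() from the xmit hot path, and dsa_port_xmit_work() later hands the queued skb to the driver's new port_deferred_xmit op in process context. A schematic driver-side handler - the sja1105 driver is the first real user, and example_setup_mgmt_route() is a made-up stand-in for whatever slow out-of-band setup (e.g. over SPI) the hardware needs:

static void example_port_deferred_xmit(struct dsa_switch *ds, int port,
				       struct sk_buff *skb)
{
	/* Sleepable configuration that must precede this frame,
	 * e.g. installing a one-shot management route over SPI.
	 */
	example_setup_mgmt_route(ds, port, eth_hdr(skb)->h_dest);

	/* Then let the (already tagged) frame out through the master */
	dsa_enqueue_skb(skb, dsa_to_port(ds, port)->slave);
}

Being able to sleep before the frame leaves is the whole point of deferring.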
+ */ + if (dp->ds->devlink) + return -EOPNOTSUPP; + if (snprintf(name, len, "p%d", dp->index) >= len) return -EINVAL; @@ -764,27 +815,25 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, struct dsa_mall_tc_entry *mall_tc_entry; __be16 protocol = cls->common.protocol; struct dsa_switch *ds = dp->ds; - struct net_device *to_dev; - const struct tc_action *a; + struct flow_action_entry *act; struct dsa_port *to_dp; int err = -EOPNOTSUPP; if (!ds->ops->port_mirror_add) return err; - if (!tcf_exts_has_one_action(cls->exts)) + if (!flow_offload_has_one_action(&cls->rule->action)) return err; - a = tcf_exts_first_action(cls->exts); + act = &cls->rule->action.entries[0]; - if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) { + if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { struct dsa_mall_mirror_tc_entry *mirror; - to_dev = tcf_mirred_dev(a); - if (!to_dev) + if (!act->dev) return -EINVAL; - if (!dsa_slave_dev_check(to_dev)) + if (!dsa_slave_dev_check(act->dev)) return -EOPNOTSUPP; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); @@ -795,7 +844,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; - to_dp = dsa_slave_to_port(to_dev); + to_dp = dsa_slave_to_port(act->dev); mirror->to_local_port = to_dp->index; mirror->ingress = ingress; @@ -987,13 +1036,6 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct switchdev_obj_port_vlan vlan = { - .vid_begin = vid, - .vid_end = vid, - /* This API only allows programming tagged, non-PVID VIDs */ - .flags = 0, - }; - struct switchdev_trans trans; struct bridge_vlan_info info; int ret; @@ -1010,25 +1052,14 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, return -EBUSY; } - trans.ph_prepare = true; - ret = dsa_port_vlan_add(dp, &vlan, &trans); - if (ret == -EOPNOTSUPP) - return 0; - - trans.ph_prepare = false; - return dsa_port_vlan_add(dp, &vlan, &trans); + /* This API only allows programming tagged, non-PVID VIDs */ + return dsa_port_vid_add(dp, vid, 0); } static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct switchdev_obj_port_vlan vlan = { - .vid_begin = vid, - .vid_end = vid, - /* This API only allows programming tagged, non-PVID VIDs */ - .flags = 0, - }; struct bridge_vlan_info info; int ret; @@ -1045,7 +1076,7 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, return -EBUSY; } - ret = dsa_port_vlan_del(dp, &vlan); + ret = dsa_port_vid_del(dp, vid); if (ret == -EOPNOTSUPP) ret = 0; @@ -1096,6 +1127,13 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], return dsa_port_fdb_del(dp, addr, vid); } +static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dp->ds->devlink ? 
&dp->devlink_port : NULL; +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1119,6 +1157,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_get_port_parent_id = dsa_slave_get_port_parent_id, .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, + .ndo_get_devlink_port = dsa_slave_get_devlink_port, }; static struct device_type dsa_type = { @@ -1283,9 +1322,9 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); - if (ret == -ENODEV) { - /* We could not connect to a designated PHY or SFP, so use the - * switch internal MDIO bus instead + if (ret == -ENODEV && ds->slave_mii_bus) { + /* We could not connect to a designated PHY or SFP, so try to + * use the switch internal MDIO bus instead */ ret = dsa_slave_phy_connect(slave_dev, dp->index); if (ret) { @@ -1297,7 +1336,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) } } - return 0; + return ret; } static struct lock_class_key dsa_slave_netdev_xmit_lock_key; @@ -1316,6 +1355,9 @@ int dsa_slave_suspend(struct net_device *slave_dev) if (!netif_running(slave_dev)) return 0; + cancel_work_sync(&dp->xmit_work); + skb_queue_purge(&dp->xmit_queue); + netif_device_detach(slave_dev); rtnl_lock(); @@ -1378,7 +1420,10 @@ int dsa_slave_create(struct dsa_port *port) NETIF_F_HW_VLAN_CTAG_FILTER; slave_dev->hw_features |= NETIF_F_HW_TC; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - eth_hw_addr_inherit(slave_dev, master); + if (!IS_ERR_OR_NULL(port->mac)) + ether_addr_copy(slave_dev->dev_addr, port->mac); + else + eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; slave_dev->min_mtu = 0; @@ -1400,6 +1445,8 @@ int dsa_slave_create(struct dsa_port *port) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); + INIT_WORK(&port->xmit_work, dsa_port_xmit_work); + skb_queue_head_init(&port->xmit_queue); p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index e1fae969aa73..7d8cd9bc0ecc 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -10,6 +10,7 @@ * (at your option) any later version. */ +#include <linux/if_bridge.h> #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/if_vlan.h> @@ -71,6 +72,9 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { + bool unset_vlan_filtering = br_vlan_enabled(info->br); + int err, i; + if (ds->index == info->sw_index && ds->ops->port_bridge_leave) ds->ops->port_bridge_leave(ds, info->port, info->br); @@ -78,6 +82,31 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port, info->br); + /* If the bridge was vlan_filtering, the bridge core doesn't trigger an + * event for changing vlan_filtering setting upon slave ports leaving + * it. That is a good thing, because that lets us handle it and also + * handle the case where the switch's vlan_filtering setting is global + * (not per port). When that happens, the correct moment to trigger the + * vlan_filtering callback is only when the last port left this bridge. 
+ */ + if (unset_vlan_filtering && ds->vlan_filtering_is_global) { + for (i = 0; i < ds->num_ports; i++) { + if (i == info->port) + continue; + if (dsa_to_port(ds, i)->bridge_dev == info->br) { + unset_vlan_filtering = false; + break; + } + } + } + if (unset_vlan_filtering) { + struct switchdev_trans trans = {0}; + + err = dsa_port_vlan_filtering(&ds->ports[info->port], + false, &trans); + if (err && err != -EOPNOTSUPP) + return err; + } return 0; } @@ -196,7 +225,7 @@ static int dsa_port_vlan_check(struct dsa_switch *ds, int port, if (!dp->bridge_dev) return err; - /* dsa_slave_vlan_rx_{add,kill}_vid() cannot use the prepare pharse and + /* dsa_slave_vlan_rx_{add,kill}_vid() cannot use the prepare phase and * already checks whether there is an overlapping bridge VLAN entry * with the same VID, so here we only need to check that if we are * adding a bridge VLAN entry there is not an overlapping VLAN device diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c new file mode 100644 index 000000000000..8ae48c7e1e76 --- /dev/null +++ b/net/dsa/tag_8021q.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com> + * + * This module is not a complete tagger implementation. It only provides + * primitives for taggers that rely on 802.1Q VLAN tags to use. The + * dsa_8021q_netdev_ops is registered for API compliance and not used + * directly by callers. + */ +#include <linux/if_bridge.h> +#include <linux/if_vlan.h> + +#include "dsa_priv.h" + +/* Allocating two VLAN tags per port - one for the RX VID and + * the other for the TX VID - see below + */ +#define DSA_8021Q_VID_RANGE (DSA_MAX_SWITCHES * DSA_MAX_PORTS) +#define DSA_8021Q_VID_BASE (VLAN_N_VID - 2 * DSA_8021Q_VID_RANGE - 1) +#define DSA_8021Q_RX_VID_BASE (DSA_8021Q_VID_BASE) +#define DSA_8021Q_TX_VID_BASE (DSA_8021Q_VID_BASE + DSA_8021Q_VID_RANGE) + +/* Returns the VID to be inserted into the frame from xmit for switch steering + * instructions on egress. Encodes switch ID and port ID. + */ +u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port) +{ + return DSA_8021Q_TX_VID_BASE + (DSA_MAX_PORTS * ds->index) + port; +} +EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); + +/* Returns the VID that will be installed as pvid for this switch port, sent as + * tagged egress towards the CPU port and decoded by the rcv function. + */ +u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) +{ + return DSA_8021Q_RX_VID_BASE + (DSA_MAX_PORTS * ds->index) + port; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); + +/* Returns the decoded switch ID from the RX VID. */ +int dsa_8021q_rx_switch_id(u16 vid) +{ + return ((vid - DSA_8021Q_RX_VID_BASE) / DSA_MAX_PORTS); +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_switch_id); + +/* Returns the decoded port ID from the RX VID. */ +int dsa_8021q_rx_source_port(u16 vid) +{ + return ((vid - DSA_8021Q_RX_VID_BASE) % DSA_MAX_PORTS); +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); + +/* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single + * front-panel switch port (here swp0). + * + * Port identification through VLAN (802.1Q) tags has different requirements + * for it to work effectively: + * - On RX (ingress from network): each front-panel port must have a pvid + * that uniquely identifies it, and the egress of this pvid must be tagged + * towards the CPU port, so that software can recover the source port based + * on the VID in the frame.
But this would only work for standalone ports; + * if bridged, this VLAN setup would break autonomous forwarding and would + * force all switched traffic to pass through the CPU. So we must also make + * the other front-panel ports members of this VID we're adding, albeit + * we're not making it their PVID (they'll still have their own). + * By the way - just because we're installing the same VID in multiple + * switch ports doesn't mean that they'll start to talk to one another, even + * while not bridged: the final forwarding decision is still an AND between + * the L2 forwarding information (which is limiting forwarding in this case) + * and the VLAN-based restrictions (of which there are none in this case, + * since all ports are members). + * - On TX (ingress from CPU and towards network) we are faced with a problem. + * If we were to tag traffic (from within DSA) with the port's pvid, all + * would be well, assuming the switch ports were standalone. Frames would + * have no choice but to be directed towards the correct front-panel port. + * But because we also want the RX VLAN to not break bridging, then + * inevitably that means that we have to give them a choice (of what + * front-panel port to go out on), and therefore we cannot steer traffic + * based on the RX VID. So what we do is simply install one more VID on the + * front-panel and CPU ports, and profit off of the fact that steering will + * work just by virtue of the fact that there is only one other port that's + * a member of the VID we're tagging the traffic with - the desired one. + * + * So at the end, each front-panel port will have one RX VID (also the PVID), + * the RX VID of all other front-panel ports, and one TX VID. Whereas the CPU + * port will have the RX and TX VIDs of all front-panel ports, and on top of + * that, is also tagged-input and tagged-output (VLAN trunk). + * + * CPU port CPU port + * +-------------+-----+-------------+ +-------------+-----+-------------+ + * | RX VID | | | | TX VID | | | + * | of swp0 | | | | of swp0 | | | + * | +-----+ | | +-----+ | + * | ^ T | | | Tagged | + * | | | | | ingress | + * | +-------+---+---+-------+ | | +-----------+ | + * | | | | | | | | Untagged | + * | | U v U v U v | | v egress | + * | +-----+ +-----+ +-----+ +-----+ | | +-----+ +-----+ +-----+ +-----+ | + * | | | | | | | | | | | | | | | | | | | | + * | |PVID | | | | | | | | | | | | | | | | | | + * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ + * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 + */ +int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) +{ + int upstream = dsa_upstream_port(ds, port); + struct dsa_port *dp = &ds->ports[port]; + struct dsa_port *upstream_dp = &ds->ports[upstream]; + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + u16 tx_vid = dsa_8021q_tx_vid(ds, port); + int i, err; + + /* The CPU port is implicitly configured by + * configuring the front-panel ports + */ + if (!dsa_is_user_port(ds, port)) + return 0; + + /* Add this user port's RX VID to the membership list of all others + * (including itself). This is so that bridging will not be hindered. + * L2 forwarding rules still take precedence when there are no VLAN + * restrictions, so there are no concerns about leaking traffic. + */ + for (i = 0; i < ds->num_ports; i++) { + struct dsa_port *other_dp = &ds->ports[i]; + u16 flags; + + if (i == upstream) + /* CPU port needs to see this port's RX VID + * as tagged egress. 
+ */ + flags = 0; + else if (i == port) + /* The RX VID is pvid on this port */ + flags = BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_PVID; + else + /* The RX VID is a regular VLAN on all others */ + flags = BRIDGE_VLAN_INFO_UNTAGGED; + + if (enabled) + err = dsa_port_vid_add(other_dp, rx_vid, flags); + else + err = dsa_port_vid_del(other_dp, rx_vid); + if (err) { + dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n", + rx_vid, port, err); + return err; + } + } + /* Finally apply the TX VID on this port and on the CPU port */ + if (enabled) + err = dsa_port_vid_add(dp, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED); + else + err = dsa_port_vid_del(dp, tx_vid); + if (err) { + dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n", + tx_vid, port, err); + return err; + } + if (enabled) + err = dsa_port_vid_add(upstream_dp, tx_vid, 0); + else + err = dsa_port_vid_del(upstream_dp, tx_vid); + if (err) { + dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n", + tx_vid, upstream, err); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); + +struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, + u16 tpid, u16 tci) +{ + /* skb->data points at skb_mac_header, which + * is fine for vlan_insert_tag. + */ + return vlan_insert_tag(skb, htons(tpid), tci); +} +EXPORT_SYMBOL_GPL(dsa_8021q_xmit); + +struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev, + struct packet_type *pt, u16 *tpid, u16 *tci) +{ + struct vlan_ethhdr *tag; + + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN))) + return NULL; + + tag = vlan_eth_hdr(skb); + *tpid = ntohs(tag->h_vlan_proto); + *tci = ntohs(tag->h_vlan_TCI); + + /* skb->data points in the middle of the VLAN tag, + * after tpid and before tci. This is because so far, + * ETH_HLEN (DMAC, SMAC, EtherType) bytes were pulled. + * There are 2 bytes of VLAN tag left in skb->data, and upper + * layers expect the 'real' EtherType to be consumed as well. + * Coincidentally, a VLAN header is also of the same size as + * the number of bytes that need to be pulled. + */ + skb_pull_rcsum(skb, VLAN_HLEN); + + return skb; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rcv); + +static const struct dsa_device_ops dsa_8021q_netdev_ops = { + .name = "8021q", + .proto = DSA_TAG_PROTO_8021Q, + .overhead = VLAN_HLEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_8021Q); + +module_dsa_tag_driver(dsa_8021q_netdev_ops); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 4aa1d368a5ae..9c3114179690 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Broadcom tag support * * Copyright (C) 2014 Broadcom Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
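To make the tag_8021q.c VID partitioning above concrete, here is a worked example. The numbers assume DSA_MAX_SWITCHES = 4 and DSA_MAX_PORTS = 12, the include/net/dsa.h values at the time of this series (check the header for the current ones):

/* DSA_8021Q_VID_RANGE   = 4 * 12            = 48
 * DSA_8021Q_VID_BASE    = 4096 - 2 * 48 - 1 = 3999
 * DSA_8021Q_RX_VID_BASE = 3999
 * DSA_8021Q_TX_VID_BASE = 3999 + 48         = 4047
 *
 * For switch 0, port 3 (swp3):
 *   rx_vid = 3999 + 0 * 12 + 3 = 4002   (pvid of swp3, tagged towards CPU)
 *   tx_vid = 4047 + 0 * 12 + 3 = 4050   (steers CPU-injected frames to swp3)
 *
 * Decoding VID 4002 in the rcv path:
 *   switch_id   = (4002 - 3999) / 12 = 0
 *   source_port = (4002 - 3999) % 12 = 3
 */

In other words, the top 96 usable VIDs of the 802.1Q space are reserved for port steering, and everything below DSA_8021Q_VID_BASE stays available to the bridge.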
*/ #include <linux/etherdevice.h> @@ -59,6 +55,9 @@ #define BRCM_EG_TC_MASK 0x7 #define BRCM_EG_PID_MASK 0x1f +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) || \ + IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) + static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, struct net_device *dev, unsigned int offset) @@ -143,8 +142,9 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, return skb; } +#endif -#ifdef CONFIG_NET_DSA_TAG_BRCM +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -171,14 +171,19 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, return nskb; } -const struct dsa_device_ops brcm_netdev_ops = { +static const struct dsa_device_ops brcm_netdev_ops = { + .name = "brcm", + .proto = DSA_TAG_PROTO_BRCM, .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, .overhead = BRCM_TAG_LEN, }; + +DSA_TAG_DRIVER(brcm_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM); #endif -#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, struct net_device *dev) { @@ -194,9 +199,27 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); } -const struct dsa_device_ops brcm_prepend_netdev_ops = { +static const struct dsa_device_ops brcm_prepend_netdev_ops = { + .name = "brcm-prepend", + .proto = DSA_TAG_PROTO_BRCM_PREPEND, .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, .overhead = BRCM_TAG_LEN, }; + +DSA_TAG_DRIVER(brcm_prepend_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND); #endif + +static struct dsa_tag_driver *dsa_tag_driver_array[] = { +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) + &DSA_TAG_DRIVER_NAME(brcm_netdev_ops), +#endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) + &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops), +#endif +}; + +module_dsa_tag_drivers(dsa_tag_driver_array); + +MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 67ff3fae18d8..7ddec9794477 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging * Copyright (c) 2008-2009 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
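The Broadcom file above shows the array form of the new boilerplate, needed because one module carries two taggers selected by separate Kconfig symbols. For a module with a single tagger the boilerplate collapses further; a hypothetical skeleton follows - the names are invented, the macros are the ones this series introduces:

/* tag_example.c - minimal single-tagger module skeleton */
static struct sk_buff *example_xmit(struct sk_buff *skb,
				    struct net_device *dev)
{
	/* A real tagger inserts its header here and accounts
	 * for it in .overhead; returning skb unchanged is the
	 * no-op case.
	 */
	return skb;
}

static const struct dsa_device_ops example_netdev_ops = {
	.name	  = "example",
	.proto	  = DSA_TAG_PROTO_NONE,	/* stand-in; a real tagger has its own enum value */
	.xmit	  = example_xmit,
	.overhead = 0,
};

MODULE_LICENSE("GPL");
/* The alias is what lets dsa_tag_driver_get() autoload this module */
MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NONE);

module_dsa_tag_driver(example_netdev_ops);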
*/ #include <linux/etherdevice.h> @@ -154,9 +150,16 @@ static int dsa_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, return 0; } -const struct dsa_device_ops dsa_netdev_ops = { +static const struct dsa_device_ops dsa_netdev_ops = { + .name = "dsa", + .proto = DSA_TAG_PROTO_DSA, .xmit = dsa_xmit, .rcv = dsa_rcv, .flow_dissect = dsa_tag_flow_dissect, .overhead = DSA_HLEN, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA); + +module_dsa_tag_driver(dsa_netdev_ops); diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 234585ec116e..e8eaa804ccb9 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * net/dsa/tag_edsa.c - Ethertype DSA tagging * Copyright (c) 2008-2009 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #include <linux/etherdevice.h> @@ -173,9 +169,16 @@ static int edsa_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, return 0; } -const struct dsa_device_ops edsa_netdev_ops = { +static const struct dsa_device_ops edsa_netdev_ops = { + .name = "edsa", + .proto = DSA_TAG_PROTO_EDSA, .xmit = edsa_xmit, .rcv = edsa_rcv, .flow_dissect = edsa_tag_flow_dissect, .overhead = EDSA_HLEN, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA); + +module_dsa_tag_driver(edsa_netdev_ops); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index cb6f82ffe5eb..b678160bbd66 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -103,8 +103,15 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, return skb; } -const struct dsa_device_ops gswip_netdev_ops = { +static const struct dsa_device_ops gswip_netdev_ops = { + .name = "gswip", + .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, .overhead = GSWIP_RX_HEADER_LEN, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP); + +module_dsa_tag_driver(gswip_netdev_ops); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index de246c93d3bb..b4872b87d4a6 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * net/dsa/tag_ksz.c - Microchip KSZ Switch tag format handling * Copyright (c) 2017 Microchip Technology - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version.
*/ #include <linux/etherdevice.h> @@ -137,12 +133,17 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, return ksz_common_rcv(skb, dev, port, len); } -const struct dsa_device_ops ksz9477_netdev_ops = { +static const struct dsa_device_ops ksz9477_netdev_ops = { + .name = "ksz9477", + .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, .overhead = KSZ9477_INGRESS_TAG_LEN, }; +DSA_TAG_DRIVER(ksz9477_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477); + #define KSZ9893_TAIL_TAG_OVERRIDE BIT(5) #define KSZ9893_TAIL_TAG_LOOKUP BIT(6) @@ -170,8 +171,22 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, return nskb; } -const struct dsa_device_ops ksz9893_netdev_ops = { +static const struct dsa_device_ops ksz9893_netdev_ops = { + .name = "ksz9893", + .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, .overhead = KSZ_INGRESS_TAG_LEN, }; + +DSA_TAG_DRIVER(ksz9893_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893); + +static struct dsa_tag_driver *dsa_tag_driver_array[] = { + &DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops), + &DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops), +}; + +module_dsa_tag_drivers(dsa_tag_driver_array); + +MODULE_LICENSE("GPL"); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index f48889e46ff7..eb0e7a32e53d 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -1,15 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Pengutronix, Juergen Borleis <jbe@pengutronix.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include <linux/dsa/lan9303.h> #include <linux/etherdevice.h> @@ -137,8 +128,15 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, return skb; } -const struct dsa_device_ops lan9303_netdev_ops = { +static const struct dsa_device_ops lan9303_netdev_ops = { + .name = "lan9303", + .proto = DSA_TAG_PROTO_LAN9303, .xmit = lan9303_xmit, .rcv = lan9303_rcv, .overhead = LAN9303_TAG_LEN, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303); + +module_dsa_tag_driver(lan9303_netdev_ops); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index f39f4dfeda34..b5705cba8318 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -1,15 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Mediatek DSA Tag support * Copyright (C) 2017 Landen Chao <landen.chao@mediatek.com> * Sean Wang <sean.wang@mediatek.com> - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
*/ #include <linux/etherdevice.h> @@ -105,9 +98,16 @@ static int mtk_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, return 0; } -const struct dsa_device_ops mtk_netdev_ops = { +static const struct dsa_device_ops mtk_netdev_ops = { + .name = "mtk", + .proto = DSA_TAG_PROTO_MTK, .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, .flow_dissect = mtk_tag_flow_dissect, .overhead = MTK_HDR_LEN, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK); + +module_dsa_tag_driver(mtk_netdev_ops); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index ed4f6dc26365..c95885215525 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2015, The Linux Foundation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/etherdevice.h> @@ -98,8 +90,25 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, return skb; } -const struct dsa_device_ops qca_netdev_ops = { +static int qca_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + *offset = QCA_HDR_LEN; + *proto = ((__be16 *)skb->data)[0]; + + return 0; +} + +static const struct dsa_device_ops qca_netdev_ops = { + .name = "qca", + .proto = DSA_TAG_PROTO_QCA, .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, + .flow_dissect = qca_tag_flow_dissect, .overhead = QCA_HDR_LEN, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA); + +module_dsa_tag_driver(qca_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c new file mode 100644 index 000000000000..969402c7dbf1 --- /dev/null +++ b/net/dsa/tag_sja1105.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com> + */ +#include <linux/if_vlan.h> +#include <linux/dsa/sja1105.h> +#include <linux/dsa/8021q.h> +#include <linux/packing.h> +#include "dsa_priv.h" + +/* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ +static inline bool sja1105_is_link_local(const struct sk_buff *skb) +{ + const struct ethhdr *hdr = eth_hdr(skb); + u64 dmac = ether_addr_to_u64(hdr->h_dest); + + if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) == + SJA1105_LINKLOCAL_FILTER_A) + return true; + if ((dmac & SJA1105_LINKLOCAL_FILTER_B_MASK) == + SJA1105_LINKLOCAL_FILTER_B) + return true; + return false; +} + +/* This is the first time the tagger sees the frame on RX. + * Figure out if we can decode it, and if we can, annotate skb->cb with how we + * plan to do that, so we don't need to check again in the rcv function. 
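sja1105_is_link_local() above reduces the two DMAC templates to masked 64-bit compares. A standalone worked example of the same test (the filter constants live in include/linux/dsa/sja1105.h; the concrete values used here, covering the 01:80:c2:xx:xx:xx link-local range, are assumptions for illustration):

#include <stdint.h>
#include <stdio.h>

#define LINKLOCAL_FILTER_A	0x0180C2000000ull	/* assumed value */
#define LINKLOCAL_FILTER_A_MASK	0xFFFFFF000000ull	/* assumed value */

int main(void)
{
	/* DMAC 01:80:c2:00:00:0e, e.g. an LLDP frame */
	uint64_t dmac = 0x0180C200000Eull;

	/* prints "link-local: 1" */
	printf("link-local: %d\n",
	       (dmac & LINKLOCAL_FILTER_A_MASK) == LINKLOCAL_FILTER_A);
	return 0;
}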
+ */ +static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) +{ + if (sja1105_is_link_local(skb)) { + SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_LINK_LOCAL; + return true; + } + if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) { + SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_NORMAL; + return true; + } + return false; +} + +static struct sk_buff *sja1105_xmit(struct sk_buff *skb, + struct net_device *netdev) +{ + struct dsa_port *dp = dsa_slave_to_port(netdev); + struct dsa_switch *ds = dp->ds; + u16 tx_vid = dsa_8021q_tx_vid(ds, dp->index); + u8 pcp = skb->priority; + + /* Transmitting management traffic does not rely upon switch tagging, + * but instead SPI-installed management routes. Part 2 of this + * is the .port_deferred_xmit driver callback. + */ + if (unlikely(sja1105_is_link_local(skb))) + return dsa_defer_xmit(skb, netdev); + + /* If we are under a vlan_filtering bridge, IP termination on + * switch ports based on 802.1Q tags is simply too brittle to + * be passable. So just defer to the dsa_slave_notag_xmit + * implementation. + */ + if (dsa_port_is_vlan_filtering(dp)) + return skb; + + return dsa_8021q_xmit(skb, netdev, ETH_P_SJA1105, + ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); +} + +static struct sk_buff *sja1105_rcv(struct sk_buff *skb, + struct net_device *netdev, + struct packet_type *pt) +{ + struct ethhdr *hdr = eth_hdr(skb); + u64 source_port, switch_id; + struct sk_buff *nskb; + u16 tpid, vid, tci; + bool is_tagged; + + nskb = dsa_8021q_rcv(skb, netdev, pt, &tpid, &tci); + is_tagged = (nskb && tpid == ETH_P_SJA1105); + + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + vid = tci & VLAN_VID_MASK; + + skb->offload_fwd_mark = 1; + + if (SJA1105_SKB_CB(skb)->type == SJA1105_FRAME_TYPE_LINK_LOCAL) { + /* Management traffic path. Switch embeds the switch ID and + * port ID into bytes of the destination MAC, courtesy of + * the incl_srcpt options. + */ + source_port = hdr->h_dest[3]; + switch_id = hdr->h_dest[4]; + /* Clear the DMAC bytes that were mangled by the switch */ + hdr->h_dest[3] = 0; + hdr->h_dest[4] = 0; + } else { + /* Normal traffic path. */ + source_port = dsa_8021q_rx_source_port(vid); + switch_id = dsa_8021q_rx_switch_id(vid); + } + + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (!skb->dev) { + netdev_warn(netdev, "Couldn't decode source port\n"); + return NULL; + } + + /* Delete/overwrite fake VLAN header, DSA expects to not find + * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). 
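For the non-management path in sja1105_xmit() above, the TCI argument handed to dsa_8021q_xmit() is an ordinary VLAN tag control word: skb->priority becomes the PCP field (bits 15..13) and the per-port tx_vid fills the VID field (bits 11..0). A small worked example (the VID value is hypothetical):

#include <stdint.h>
#include <stdio.h>

#define VLAN_PRIO_SHIFT	13

int main(void)
{
	uint16_t tx_vid = 1040;	/* hypothetical dsa_8021q TX VID */
	uint8_t pcp = 7;	/* from skb->priority */
	uint16_t tci = (pcp << VLAN_PRIO_SHIFT) | tx_vid;

	printf("tci = 0x%04x\n", tci);	/* prints "tci = 0xe410" */
	return 0;
}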
+ */ + if (is_tagged) + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - VLAN_HLEN, + ETH_HLEN - VLAN_HLEN); + + return skb; +} + +static struct dsa_device_ops sja1105_netdev_ops = { + .name = "sja1105", + .proto = DSA_TAG_PROTO_SJA1105, + .xmit = sja1105_xmit, + .rcv = sja1105_rcv, + .filter = sja1105_filter, + .overhead = VLAN_HLEN, +}; + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105); + +module_dsa_tag_driver(sja1105_netdev_ops); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b40756ed6e57..4f8ab62f0208 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * net/dsa/tag_trailer.c - Trailer tag format handling * Copyright (c) 2008-2009 Marvell Semiconductor - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #include <linux/etherdevice.h> @@ -81,8 +77,15 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, return skb; } -const struct dsa_device_ops trailer_netdev_ops = { +static const struct dsa_device_ops trailer_netdev_ops = { + .name = "trailer", + .proto = DSA_TAG_PROTO_TRAILER, .xmit = trailer_xmit, .rcv = trailer_rcv, .overhead = 4, }; + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER); + +module_dsa_tag_driver(trailer_netdev_ops); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f7a3d7a171c7..4b2b222377ac 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -119,13 +119,14 @@ EXPORT_SYMBOL(eth_header); /** * eth_get_headlen - determine the length of header for an ethernet frame + * @dev: pointer to network device * @data: pointer to start of frame * @len: total length of frame * * Make a best effort attempt to pull the length for all of the headers for * a given frame in a linear buffer. */ -u32 eth_get_headlen(void *data, unsigned int len) +u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; @@ -136,8 +137,9 @@ u32 eth_get_headlen(void *data, unsigned int len) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto, - sizeof(*eth), len, flags)) + if (!skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data, + eth->h_proto, sizeof(*eth), + len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ @@ -183,8 +185,12 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * at all, so we check here whether one of those tagging * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. + * The DSA tagging protocol may be able to decode some but not all + * traffic (for example only for management). In that case give it the + * option to filter the packets from which it can decode source port + * information. 
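The dsa_can_decode() gate used in eth_type_trans() above presumably just defers to the tagger's optional .filter hook, assuming the filter is cached on the CPU port reachable through dev->dsa_ptr. A sketch of its likely shape (the actual inline helper lives in include/net/dsa.h):

static inline bool dsa_can_decode(const struct sk_buff *skb,
				  struct net_device *dev)
{
#if IS_ENABLED(CONFIG_NET_DSA)
	/* Taggers without a .filter callback decode everything */
	return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev);
#endif
	return false;
}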
*/ - if (unlikely(netdev_uses_dsa(dev))) + if (unlikely(netdev_uses_dsa(dev)) && dsa_can_decode(skb, dev)) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) @@ -554,7 +560,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) addr = NULL; if (dp) addr = of_get_mac_address(dp); - if (!addr) + if (IS_ERR_OR_NULL(addr)) addr = arch_get_platform_mac_address(); if (!addr) diff --git a/net/hsr/Makefile b/net/hsr/Makefile index 9ae972a820f4..e45757fc477f 100644 --- a/net/hsr/Makefile +++ b/net/hsr/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_HSR) += hsr.o hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \ hsr_netlink.o hsr_slave.o hsr_forward.o +hsr-$(CONFIG_DEBUG_FS) += hsr_debugfs.o diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c new file mode 100644 index 000000000000..94447974a3c0 --- /dev/null +++ b/net/hsr/hsr_debugfs.c @@ -0,0 +1,119 @@ +/* + * hsr_debugfs code + * Copyright (C) 2019 Texas Instruments Incorporated + * + * Author(s): + * Murali Karicheri <m-karicheri2@ti.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/debugfs.h> +#include "hsr_main.h" +#include "hsr_framereg.h" + +static void print_mac_address(struct seq_file *sfp, unsigned char *mac) +{ + seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); +} + +/* hsr_node_table_show - Formats and prints node_table entries */ +static int +hsr_node_table_show(struct seq_file *sfp, void *data) +{ + struct hsr_priv *priv = (struct hsr_priv *)sfp->private; + struct hsr_node *node; + + seq_puts(sfp, "Node Table entries\n"); + seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], "); + seq_puts(sfp, "time_in[B], Address-B port\n"); + rcu_read_lock(); + list_for_each_entry_rcu(node, &priv->node_db, mac_list) { + /* skip self node */ + if (hsr_addr_is_self(priv, node->macaddress_A)) + continue; + print_mac_address(sfp, &node->macaddress_A[0]); + seq_puts(sfp, " "); + print_mac_address(sfp, &node->macaddress_B[0]); + seq_printf(sfp, "0x%lx, ", node->time_in[HSR_PT_SLAVE_A]); + seq_printf(sfp, "0x%lx ", node->time_in[HSR_PT_SLAVE_B]); + seq_printf(sfp, "0x%x\n", node->addr_B_port); + } + rcu_read_unlock(); + return 0; +} + +/* hsr_node_table_open - Open the node_table file + * + * Description: + * This routine opens a debugfs file node_table of a specific hsr device + */ +static int +hsr_node_table_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, hsr_node_table_show, inode->i_private); +} + +static const struct file_operations hsr_fops = { + .owner = THIS_MODULE, + .open = hsr_node_table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* hsr_debugfs_init - create hsr node_table file for dumping + * the node table + * + * Description: + * When debugfs is configured this routine sets up the node_table file per + * hsr device for dumping the node_table entries + */ +int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) +{ + int rc = -1; + struct dentry *de = NULL; + + de =
debugfs_create_dir(hsr_dev->name, NULL); + if (!de) { + pr_err("Cannot create hsr debugfs root\n"); + return rc; + } + + priv->node_tbl_root = de; + + de = debugfs_create_file("node_table", S_IFREG | 0444, + priv->node_tbl_root, priv, + &hsr_fops); + if (!de) { + pr_err("Cannot create hsr node_table file\n"); + return rc; + } + priv->node_tbl_file = de; + + return 0; +} + +/* hsr_debugfs_term - Tear down debugfs infrastructure + * + * Description: + * When debugfs is configured this routine removes debugfs file system + * elements that are specific to hsr + */ +void +hsr_debugfs_term(struct hsr_priv *priv) +{ + debugfs_remove(priv->node_tbl_file); + priv->node_tbl_file = NULL; + debugfs_remove(priv->node_tbl_root); + priv->node_tbl_root = NULL; +} diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index a97bf326b231..15c72065df79 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * @@ -23,7 +19,6 @@ #include "hsr_main.h" #include "hsr_forward.h" - static bool is_admin_up(struct net_device *dev) { return dev && (dev->flags & IFF_UP); } @@ -68,7 +63,7 @@ static bool hsr_check_carrier(struct hsr_port *master) rcu_read_lock(); hsr_for_each_port(master->hsr, port) - if ((port->type != HSR_PT_MASTER) && is_slave_up(port->dev)) { + if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { has_carrier = true; break; } @@ -82,7 +77,6 @@ static bool hsr_check_carrier(struct hsr_port *master) return has_carrier; } - static void hsr_check_announce(struct net_device *hsr_dev, unsigned char old_operstate) { @@ -90,15 +84,14 @@ static void hsr_check_announce(struct net_device *hsr_dev, hsr = netdev_priv(hsr_dev); - if ((hsr_dev->operstate == IF_OPER_UP) - && (old_operstate != IF_OPER_UP)) { + if (hsr_dev->operstate == IF_OPER_UP && old_operstate != IF_OPER_UP) { /* Went up */ hsr->announce_count = 0; mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } - if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP)) + if (hsr_dev->operstate != IF_OPER_UP && old_operstate == IF_OPER_UP) /* Went down */ del_timer(&hsr->announce_timer); } @@ -136,7 +129,6 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) return mtu_max - HSR_HLEN; } - static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; @@ -191,14 +183,12 @@ static int hsr_dev_open(struct net_device *dev) return 0; } - static int hsr_dev_close(struct net_device *dev) { /* Nothing to do here.
*/ return 0; } - static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, netdev_features_t features) { @@ -231,7 +221,6 @@ static netdev_features_t hsr_fix_features(struct net_device *dev, return hsr_features_recompute(hsr, features); } - static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); @@ -244,14 +233,13 @@ static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } - static const struct header_ops hsr_header_ops = { .create = eth_header, .parse = eth_header_parse, }; static void send_hsr_supervision_frame(struct hsr_port *master, - u8 type, u8 hsrVer) + u8 type, u8 hsr_ver) { struct sk_buff *skb; int hlen, tlen; @@ -262,39 +250,38 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; - skb = dev_alloc_skb( - sizeof(struct hsr_tag) + - sizeof(struct hsr_sup_tag) + - sizeof(struct hsr_sup_payload) + hlen + tlen); + skb = dev_alloc_skb(sizeof(struct hsr_tag) + + sizeof(struct hsr_sup_tag) + + sizeof(struct hsr_sup_payload) + hlen + tlen); - if (skb == NULL) + if (!skb) return; skb_reserve(skb, hlen); skb->dev = master->dev; - skb->protocol = htons(hsrVer ? ETH_P_HSR : ETH_P_PRP); + skb->protocol = htons(hsr_ver ? ETH_P_HSR : ETH_P_PRP); skb->priority = TC_PRIO_CONTROL; - if (dev_hard_header(skb, skb->dev, (hsrVer ? ETH_P_HSR : ETH_P_PRP), + if (dev_hard_header(skb, skb->dev, (hsr_ver ? ETH_P_HSR : ETH_P_PRP), master->hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); - if (hsrVer > 0) { + if (hsr_ver > 0) { hsr_tag = skb_put(skb, sizeof(struct hsr_tag)); hsr_tag->encap_proto = htons(ETH_P_PRP); set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE); } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); - set_hsr_stag_path(hsr_stag, (hsrVer ? 0x0 : 0xf)); - set_hsr_stag_HSR_Ver(hsr_stag, hsrVer); + set_hsr_stag_path(hsr_stag, (hsr_ver ? 0x0 : 0xf)); + set_hsr_stag_HSR_ver(hsr_stag, hsr_ver); /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); - if (hsrVer > 0) { + if (hsr_ver > 0) { hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr); hsr_tag->sequence_nr = htons(master->hsr->sequence_nr); master->hsr->sup_sequence_nr++; @@ -305,13 +292,14 @@ static void send_hsr_supervision_frame(struct hsr_port *master, } spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); - hsr_stag->HSR_TLV_Type = type; + hsr_stag->HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ - hsr_stag->HSR_TLV_Length = hsrVer ? sizeof(struct hsr_sup_payload) : 12; + hsr_stag->HSR_TLV_length = + hsr_ver ? 
sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); - ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr); + ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) return; @@ -324,7 +312,6 @@ out: kfree_skb(skb); } - /* Announce (supervision frame) timer function */ static void hsr_announce(struct timer_list *t) @@ -338,15 +325,15 @@ static void hsr_announce(struct timer_list *t) rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - if (hsr->announce_count < 3 && hsr->protVersion == 0) { + if (hsr->announce_count < 3 && hsr->prot_version == 0) { send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE, - hsr->protVersion); + hsr->prot_version); hsr->announce_count++; interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); } else { send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK, - hsr->protVersion); + hsr->prot_version); interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); } @@ -357,7 +344,6 @@ static void hsr_announce(struct timer_list *t) rcu_read_unlock(); } - /* According to comments in the declaration of struct net_device, this function * is "Called from unregister, can be used to call free_netdev". Ok then... */ @@ -368,6 +354,8 @@ static void hsr_dev_destroy(struct net_device *hsr_dev) hsr = netdev_priv(hsr_dev); + hsr_debugfs_term(hsr); + rtnl_lock(); hsr_for_each_port(hsr, port) hsr_del_port(port); @@ -423,7 +411,6 @@ void hsr_dev_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } - /* Return true if dev is a HSR master; return false otherwise. */ inline bool is_hsr_master(struct net_device *dev) @@ -467,7 +454,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; - hsr->protVersion = protocol_version; + hsr->prot_version = protocol_version; /* FIXME: should I modify the value of these? * @@ -498,6 +485,9 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], goto fail; mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); + res = hsr_debugfs_init(hsr, hsr_dev); + if (res) + goto fail; return 0; diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 9975e31bbb82..6d7759c4f5f9 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 04b5450c5a55..ddd9605bad04 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ @@ -17,7 +13,6 @@ #include "hsr_main.h" #include "hsr_framereg.h" - struct hsr_node; struct hsr_frame_info { @@ -32,7 +27,6 @@ struct hsr_frame_info { bool is_local_exclusive; }; - /* The uses I can see for these HSR supervision frames are: * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = * 22") to reset any sequence_nr counters belonging to that node. Useful if @@ -50,46 +44,45 @@ struct hsr_frame_info { */ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { - struct ethhdr *ethHdr; - struct hsr_sup_tag *hsrSupTag; - struct hsrv1_ethhdr_sp *hsrV1Hdr; + struct ethhdr *eth_hdr; + struct hsr_sup_tag *hsr_sup_tag; + struct hsrv1_ethhdr_sp *hsr_V1_hdr; WARN_ON_ONCE(!skb_mac_header_was_set(skb)); - ethHdr = (struct ethhdr *) skb_mac_header(skb); + eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Correct addr? */ - if (!ether_addr_equal(ethHdr->h_dest, + if (!ether_addr_equal(eth_hdr->h_dest, hsr->sup_multicast_addr)) return false; /* Correct ether type?. */ - if (!(ethHdr->h_proto == htons(ETH_P_PRP) - || ethHdr->h_proto == htons(ETH_P_HSR))) + if (!(eth_hdr->h_proto == htons(ETH_P_PRP) || + eth_hdr->h_proto == htons(ETH_P_HSR))) return false; /* Get the supervision header from correct location. */ - if (ethHdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ - hsrV1Hdr = (struct hsrv1_ethhdr_sp *) skb_mac_header(skb); - if (hsrV1Hdr->hsr.encap_proto != htons(ETH_P_PRP)) + if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ + hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb); + if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP)) return false; - hsrSupTag = &hsrV1Hdr->hsr_sup; + hsr_sup_tag = &hsr_V1_hdr->hsr_sup; } else { - hsrSupTag = &((struct hsrv0_ethhdr_sp *) skb_mac_header(skb))->hsr_sup; + hsr_sup_tag = + &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup; } - if ((hsrSupTag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) && - (hsrSupTag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) + if (hsr_sup_tag->HSR_TLV_type != HSR_TLV_ANNOUNCE && + hsr_sup_tag->HSR_TLV_type != HSR_TLV_LIFE_CHECK) return false; - if ((hsrSupTag->HSR_TLV_Length != 12) && - (hsrSupTag->HSR_TLV_Length != - sizeof(struct hsr_sup_payload))) + if (hsr_sup_tag->HSR_TLV_length != 12 && + hsr_sup_tag->HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; return true; } - static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, struct hsr_frame_info *frame) { @@ -100,7 +93,7 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, skb_pull(skb_in, HSR_HLEN); skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC); skb_push(skb_in, HSR_HLEN); - if (skb == NULL) + if (!skb) return NULL; skb_reset_mac_header(skb); @@ -108,7 +101,7 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in, if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start -= HSR_HLEN; - copylen = 2*ETH_ALEN; + copylen = 2 * ETH_ALEN; if (frame->is_vlan) copylen += VLAN_HLEN; src = skb_mac_header(skb_in); @@ -127,9 +120,8 @@ static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame, return skb_clone(frame->skb_std, GFP_ATOMIC); } - static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, - struct hsr_port *port, u8 protoVersion) + struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; int lane_id; @@ -144,13 +136,13 @@ static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, if (frame->is_vlan) 
lsdu_size -= 4; - hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); + hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb); set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; - hsr_ethhdr->ethhdr.h_proto = htons(protoVersion ? + hsr_ethhdr->ethhdr.h_proto = htons(proto_version ? ETH_P_HSR : ETH_P_PRP); } @@ -164,7 +156,7 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, /* Create the new skb with enough headroom to fit the HSR tag */ skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC); - if (skb == NULL) + if (!skb) return NULL; skb_reset_mac_header(skb); @@ -180,7 +172,7 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, memmove(dst, src, movelen); skb_reset_mac_header(skb); - hsr_fill_tag(skb, frame, port, port->hsr->protVersion); + hsr_fill_tag(skb, frame, port, port->hsr->prot_version); return skb; } @@ -194,7 +186,7 @@ static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame, if (frame->skb_hsr) return skb_clone(frame->skb_hsr, GFP_ATOMIC); - if ((port->type != HSR_PT_SLAVE_A) && (port->type != HSR_PT_SLAVE_B)) { + if (port->type != HSR_PT_SLAVE_A && port->type != HSR_PT_SLAVE_B) { WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port"); return NULL; } @@ -202,7 +194,6 @@ static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame, return create_tagged_skb(frame->skb_std, frame, port); } - static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, struct hsr_node *node_src) { @@ -237,7 +228,6 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, return dev_queue_xmit(skb); } - /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before @@ -260,11 +250,11 @@ static void hsr_forward_do(struct hsr_frame_info *frame) continue; /* Don't deliver locally unless we should */ - if ((port->type == HSR_PT_MASTER) && !frame->is_local_dest) + if (port->type == HSR_PT_MASTER && !frame->is_local_dest) continue; /* Deliver frames directly addressed to us to master only */ - if ((port->type != HSR_PT_MASTER) && frame->is_local_exclusive) + if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; /* Don't send frame over port where it has been sent before */ @@ -272,7 +262,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame) frame->sequence_nr)) continue; - if (frame->is_supervision && (port->type == HSR_PT_MASTER)) { + if (frame->is_supervision && port->type == HSR_PT_MASTER) { hsr_handle_sup_frame(frame->skb_hsr, frame->node_src, frame->port_rcv); @@ -283,7 +273,7 @@ static void hsr_forward_do(struct hsr_frame_info *frame) skb = frame_get_tagged_skb(frame, port); else skb = frame_get_stripped_skb(frame, port); - if (skb == NULL) { + if (!skb) { /* FIXME: Record the dropped frame? 
*/ continue; } @@ -296,7 +286,6 @@ static void hsr_forward_do(struct hsr_frame_info *frame) } } - static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, struct hsr_frame_info *frame) { @@ -307,16 +296,15 @@ static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, frame->is_local_exclusive = false; } - if ((skb->pkt_type == PACKET_HOST) || - (skb->pkt_type == PACKET_MULTICAST) || - (skb->pkt_type == PACKET_BROADCAST)) { + if (skb->pkt_type == PACKET_HOST || + skb->pkt_type == PACKET_MULTICAST || + skb->pkt_type == PACKET_BROADCAST) { frame->is_local_dest = true; } else { frame->is_local_dest = false; } } - static int hsr_fill_frame_info(struct hsr_frame_info *frame, struct sk_buff *skb, struct hsr_port *port) { @@ -325,18 +313,18 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame, frame->is_supervision = is_supervision_frame(port->hsr, skb); frame->node_src = hsr_get_node(port, skb, frame->is_supervision); - if (frame->node_src == NULL) + if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ - ethhdr = (struct ethhdr *) skb_mac_header(skb); + ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; if (ethhdr->h_proto == htons(ETH_P_8021Q)) { frame->is_vlan = true; /* FIXME: */ WARN_ONCE(1, "HSR: VLAN not yet supported"); } - if (ethhdr->h_proto == htons(ETH_P_PRP) - || ethhdr->h_proto == htons(ETH_P_HSR)) { + if (ethhdr->h_proto == htons(ETH_P_PRP) || + ethhdr->h_proto == htons(ETH_P_HSR)) { frame->skb_std = NULL; frame->skb_hsr = skb; frame->sequence_nr = hsr_get_skb_sequence_nr(skb); @@ -371,10 +359,17 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) goto out_drop; hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); + /* Gets called for ingress frames as well as egress from master port. + * So check and increment stats for master port only here. + */ + if (port->type == HSR_PT_MASTER) { + port->dev->stats.tx_packets++; + port->dev->stats.tx_bytes += skb->len; + } - if (frame.skb_hsr != NULL) + if (frame.skb_hsr) kfree_skb(frame.skb_hsr); - if (frame.skb_std != NULL) + if (frame.skb_std) kfree_skb(frame.skb_std); return; diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 5c5bc4b6b75f..51a69295566c 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 9af16cb68f76..9fa9abd83018 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * @@ -22,23 +18,8 @@ #include "hsr_framereg.h" #include "hsr_netlink.h" - -struct hsr_node { - struct list_head mac_list; - unsigned char MacAddressA[ETH_ALEN]; - unsigned char MacAddressB[ETH_ALEN]; - /* Local slave through which AddrB frames are received from this node */ - enum hsr_port_type AddrB_port; - unsigned long time_in[HSR_PT_PORTS]; - bool time_in_stale[HSR_PT_PORTS]; - u16 seq_out[HSR_PT_PORTS]; - struct rcu_head rcu_head; -}; - - /* TODO: use hash lists for mac addresses (linux/jhash.h)? */ - /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ @@ -47,16 +28,16 @@ static bool seq_nr_after(u16 a, u16 b) /* Remove inconsistency where * seq_nr_after(a, b) == seq_nr_before(a, b) */ - if ((int) b - a == 32768) + if ((int)b - a == 32768) return false; - return (((s16) (b - a)) < 0); + return (((s16)(b - a)) < 0); } + #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b))) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) - bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_node *node; @@ -68,9 +49,9 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) return false; } - if (ether_addr_equal(addr, node->MacAddressA)) + if (ether_addr_equal(addr, node->macaddress_A)) return true; - if (ether_addr_equal(addr, node->MacAddressB)) + if (ether_addr_equal(addr, node->macaddress_B)) return true; return false; @@ -78,20 +59,19 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) /* Search for mac entry. Caller must hold rcu read lock. */ -static struct hsr_node *find_node_by_AddrA(struct list_head *node_db, - const unsigned char addr[ETH_ALEN]) +static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressA, addr)) + if (ether_addr_equal(node->macaddress_A, addr)) return node; } return NULL; } - /* Helper for device init; the self_node_db is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ @@ -105,12 +85,12 @@ int hsr_create_self_node(struct list_head *self_node_db, if (!node) return -ENOMEM; - ether_addr_copy(node->MacAddressA, addr_a); - ether_addr_copy(node->MacAddressB, addr_b); + ether_addr_copy(node->macaddress_A, addr_a); + ether_addr_copy(node->macaddress_B, addr_b); rcu_read_lock(); oldnode = list_first_or_null_rcu(self_node_db, - struct hsr_node, mac_list); + struct hsr_node, mac_list); if (oldnode) { list_replace_rcu(&oldnode->mac_list, &node->mac_list); rcu_read_unlock(); @@ -137,7 +117,7 @@ void hsr_del_node(struct list_head *self_node_db) } } -/* Allocate an hsr_node and add it to node_db. 'addr' is the node's AddressA; +/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ @@ -152,7 +132,7 @@ struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[], if (!node) return NULL; - ether_addr_copy(node->MacAddressA, addr); + ether_addr_copy(node->macaddress_A, addr); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). 
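The wraparound test in seq_nr_after() above deserves a worked example: with 16-bit sequence numbers, "after" is decided by the sign of (s16)(b - a), and the ambiguous exactly-half-way case is pinned to false so that seq_nr_after(a, b) and seq_nr_before(a, b) can never both hold. A standalone version with sample values:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the kernel's seq_nr_after(): true if a is after b */
static int seq_nr_after(uint16_t a, uint16_t b)
{
	if ((int)b - a == 32768)
		return 0;
	return (int16_t)(b - a) < 0;
}

int main(void)
{
	printf("%d\n", seq_nr_after(10, 5));	/* 1: 10 is after 5 */
	printf("%d\n", seq_nr_after(5, 65534));	/* 1: 5 has wrapped past 65534 */
	printf("%d\n", seq_nr_after(0, 32768));	/* 0: exact half-way tie */
	return 0;
}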
@@ -181,19 +161,19 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, if (!skb_mac_header_was_set(skb)) return NULL; - ethhdr = (struct ethhdr *) skb_mac_header(skb); + ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { - if (ether_addr_equal(node->MacAddressA, ethhdr->h_source)) + if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) return node; - if (ether_addr_equal(node->MacAddressB, ethhdr->h_source)) + if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) return node; } /* Everyone may create a node entry, connected node to a HSR device. */ - if (ethhdr->h_proto == htons(ETH_P_PRP) - || ethhdr->h_proto == htons(ETH_P_HSR)) { + if (ethhdr->h_proto == htons(ETH_P_PRP) || + ethhdr->h_proto == htons(ETH_P_HSR)) { /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ @@ -210,8 +190,8 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb, return hsr_add_node(node_db, ethhdr->h_source, seq_out); } -/* Use the Supervision frame's info about an eventual MacAddressB for merging - * nodes that has previously had their MacAddressB registered as a separate +/* Use the Supervision frame's info about an eventual macaddress_B for merging + * nodes that has previously had their macaddress_B registered as a separate * node. */ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, @@ -223,7 +203,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, struct list_head *node_db; int i; - ethhdr = (struct ethhdr *) skb_mac_header(skb); + ethhdr = (struct ethhdr *)skb_mac_header(skb); /* Leave the ethernet header. */ skb_pull(skb, sizeof(struct ethhdr)); @@ -235,14 +215,14 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, /* And leave the HSR sup tag. 
*/ skb_pull(skb, sizeof(struct hsr_sup_tag)); - hsr_sp = (struct hsr_sup_payload *) skb->data; + hsr_sp = (struct hsr_sup_payload *)skb->data; - /* Merge node_curr (registered on MacAddressB) into node_real */ + /* Merge node_curr (registered on macaddress_B) into node_real */ node_db = &port_rcv->hsr->node_db; - node_real = find_node_by_AddrA(node_db, hsr_sp->MacAddressA); + node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ - node_real = hsr_add_node(node_db, hsr_sp->MacAddressA, + node_real = hsr_add_node(node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1); if (!node_real) goto done; /* No mem */ @@ -250,17 +230,18 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, /* Node has already been merged */ goto done; - ether_addr_copy(node_real->MacAddressB, ethhdr->h_source); + ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { node_real->time_in[i] = node_curr->time_in[i]; - node_real->time_in_stale[i] = node_curr->time_in_stale[i]; + node_real->time_in_stale[i] = + node_curr->time_in_stale[i]; } if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } - node_real->AddrB_port = port_rcv->type; + node_real->addr_B_port = port_rcv->type; list_del_rcu(&node_curr->mac_list); kfree_rcu(node_curr, rcu_head); @@ -269,11 +250,10 @@ done: skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); } - /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * * If the frame was sent by a node's B interface, replace the source - * address with that node's "official" address (MacAddressA) so that upper + * address with that node's "official" address (macaddress_A) so that upper * layers recognize where it came from. */ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) @@ -283,7 +263,7 @@ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) return; } - memcpy(ð_hdr(skb)->h_source, node->MacAddressA, ETH_ALEN); + memcpy(ð_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN); } /* 'skb' is a frame meant for another host. @@ -308,18 +288,18 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) return; - node_dst = find_node_by_AddrA(&port->hsr->node_db, eth_hdr(skb)->h_dest); + node_dst = find_node_by_addr_A(&port->hsr->node_db, + eth_hdr(skb)->h_dest); if (!node_dst) { WARN_ONCE(1, "%s: Unknown node\n", __func__); return; } - if (port->type != node_dst->AddrB_port) + if (port->type != node_dst->addr_B_port) return; - ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->MacAddressB); + ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B); } - void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { @@ -352,7 +332,6 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, return 0; } - static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) { @@ -373,7 +352,6 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr, return NULL; } - /* Remove stale sequence_nr records. Called by timer every * HSR_LIFE_CHECK_INTERVAL (two seconds or so). 
*/ @@ -392,9 +370,9 @@ void hsr_prune_nodes(struct timer_list *t) time_b = node->time_in[HSR_PT_SLAVE_B]; /* Check for timestamps old enough to risk wrap-around */ - if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2)) + if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_A] = true; - if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2)) + if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) node->time_in_stale[HSR_PT_SLAVE_B] = true; /* Get age of newest frame from node. @@ -409,26 +387,29 @@ void hsr_prune_nodes(struct timer_list *t) /* Warn of ring error only as long as we get frames at all */ if (time_is_after_jiffies(timestamp + - msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) { + msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { rcu_read_lock(); port = get_late_port(hsr, node); - if (port != NULL) - hsr_nl_ringerror(hsr, node->MacAddressA, port); + if (port) + hsr_nl_ringerror(hsr, node->macaddress_A, port); rcu_read_unlock(); } /* Prune old entries */ if (time_is_before_jiffies(timestamp + - msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { - hsr_nl_nodedown(hsr, node->MacAddressA); + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { + hsr_nl_nodedown(hsr, node->macaddress_A); list_del_rcu(&node->mac_list); /* Note that we need to free this entry later: */ kfree_rcu(node, rcu_head); } } rcu_read_unlock(); -} + /* Restart timer */ + mod_timer(&hsr->prune_timer, + jiffies + msecs_to_jiffies(PRUNE_PERIOD)); +} void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) @@ -439,20 +420,19 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, node = list_first_or_null_rcu(&hsr->node_db, struct hsr_node, mac_list); if (node) - ether_addr_copy(addr, node->MacAddressA); + ether_addr_copy(addr, node->macaddress_A); return node; } node = _pos; list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { - ether_addr_copy(addr, node->MacAddressA); + ether_addr_copy(addr, node->macaddress_A); return node; } return NULL; } - int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], @@ -466,15 +446,14 @@ int hsr_get_node_data(struct hsr_priv *hsr, struct hsr_port *port; unsigned long tdiff; - rcu_read_lock(); - node = find_node_by_AddrA(&hsr->node_db, addr); + node = find_node_by_addr_A(&hsr->node_db, addr); if (!node) { rcu_read_unlock(); return -ENOENT; /* No such entry */ } - ether_addr_copy(addr_b, node->MacAddressB); + ether_addr_copy(addr_b, node->macaddress_B); tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A]; if (node->time_in_stale[HSR_PT_SLAVE_A]) @@ -500,8 +479,8 @@ int hsr_get_node_data(struct hsr_priv *hsr, *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; - if (node->AddrB_port != HSR_PT_NONE) { - port = hsr_port_get_hsr(hsr, node->AddrB_port); + if (node->addr_B_port != HSR_PT_NONE) { + port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; } else { *addr_b_ifindex = -1; diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 531fd3dfcac1..a3bdcdab469d 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ @@ -52,4 +48,16 @@ int hsr_get_node_data(struct hsr_priv *hsr, int *if2_age, u16 *if2_seq); +struct hsr_node { + struct list_head mac_list; + unsigned char macaddress_A[ETH_ALEN]; + unsigned char macaddress_B[ETH_ALEN]; + /* Local slave through which AddrB frames are received from this node */ + enum hsr_port_type addr_B_port; + unsigned long time_in[HSR_PT_PORTS]; + bool time_in_stale[HSR_PT_PORTS]; + u16 seq_out[HSR_PT_PORTS]; + struct rcu_head rcu_head; +}; + #endif /* __HSR_FRAMEREG_H */ diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index cd37d0011b42..b9988a662ee1 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ @@ -19,7 +15,6 @@ #include "hsr_framereg.h" #include "hsr_slave.h" - static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -31,12 +26,12 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, dev = netdev_notifier_info_to_dev(ptr); port = hsr_port_get_rtnl(dev); - if (port == NULL) { + if (!port) { if (!is_hsr_master(dev)) return NOTIFY_DONE; /* Not an HSR device */ hsr = netdev_priv(dev); port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - if (port == NULL) { + if (!port) { /* Resend of notification concerning removed device? */ return NOTIFY_DONE; } @@ -63,7 +58,8 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, if (port->type == HSR_PT_SLAVE_A) { ether_addr_copy(master->dev->dev_addr, dev->dev_addr); - call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + call_netdevice_notifiers(NETDEV_CHANGEADDR, + master->dev); } /* Make sure we recognize frames from ourselves in hsr_rcv() */ @@ -97,7 +93,6 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; } - struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; @@ -112,7 +107,6 @@ static struct notifier_block hsr_nb = { .notifier_call = hsr_netdev_notify, /* Slave event notifications */ }; - static int __init hsr_init(void) { int res; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 9b9909e89e9e..96fac696a1e1 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ @@ -15,7 +11,6 @@ #include <linux/netdevice.h> #include <linux/list.h> - /* Time constants as specified in the HSR specification (IEC-62439-3 2010) * Table 8. * All values in milliseconds. @@ -24,7 +19,6 @@ #define HSR_NODE_FORGET_TIME 60000 /* ms */ #define HSR_ANNOUNCE_INTERVAL 100 /* ms */ - /* By how much may slave1 and slave2 timestamps of latest received frame from * each node differ before we notify of communication problem? 
*/ @@ -32,17 +26,14 @@ #define HSR_SEQNR_START (USHRT_MAX - 1024) #define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2) - /* How often shall we check for broken ring and remove node entries older than * HSR_NODE_FORGET_TIME? */ #define PRUNE_PERIOD 3000 /* ms */ - #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23 - /* HSR Tag. * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, @@ -83,15 +74,14 @@ static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht) static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { - ht->path_and_LSDU_size = htons( - (ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12)); + ht->path_and_LSDU_size = + htons((ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12)); } static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size) { - ht->path_and_LSDU_size = htons( - (ntohs(ht->path_and_LSDU_size) & 0xF000) | - (LSDU_size & 0x0FFF)); + ht->path_and_LSDU_size = htons((ntohs(ht->path_and_LSDU_size) & + 0xF000) | (LSDU_size & 0x0FFF)); } struct hsr_ethhdr { @@ -99,39 +89,38 @@ struct hsr_ethhdr { struct hsr_tag hsr_tag; } __packed; - /* HSR Supervision Frame data types. * Field names as defined in the IEC:2010 standard for HSR. */ struct hsr_sup_tag { - __be16 path_and_HSR_Ver; + __be16 path_and_HSR_ver; __be16 sequence_nr; - __u8 HSR_TLV_Type; - __u8 HSR_TLV_Length; + __u8 HSR_TLV_type; + __u8 HSR_TLV_length; } __packed; struct hsr_sup_payload { - unsigned char MacAddressA[ETH_ALEN]; + unsigned char macaddress_A[ETH_ALEN]; } __packed; static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst) { - return get_hsr_tag_path((struct hsr_tag *) hst); + return get_hsr_tag_path((struct hsr_tag *)hst); } static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst) { - return get_hsr_tag_LSDU_size((struct hsr_tag *) hst); + return get_hsr_tag_LSDU_size((struct hsr_tag *)hst); } static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { - set_hsr_tag_path((struct hsr_tag *) hst, path); + set_hsr_tag_path((struct hsr_tag *)hst, path); } -static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver) +static inline void set_hsr_stag_HSR_ver(struct hsr_sup_tag *hst, u16 HSR_ver) { - set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver); + set_hsr_tag_LSDU_size((struct hsr_tag *)hst, HSR_ver); } struct hsrv0_ethhdr_sp { @@ -145,7 +134,6 @@ struct hsrv1_ethhdr_sp { struct hsr_sup_tag hsr_sup; } __packed; - enum hsr_port_type { HSR_PT_NONE = 0, /* Must be 0, used by framereg */ HSR_PT_SLAVE_A, @@ -171,10 +159,14 @@ struct hsr_priv { struct timer_list prune_timer; int announce_count; u16 sequence_nr; - u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 protVersion; /* Indicate if HSRv0 or HSRv1. */ + u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ + u8 prot_version; /* Indicate if HSRv0 or HSRv1. 
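The set_hsr_tag_path()/set_hsr_tag_LSDU_size() helpers shown above (reused by the supervision variants for the path/HSR_ver nibble) pack two fields into one 16-bit word: the path in the top nibble and the LSDU size in the low 12 bits. A worked example with concrete values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t field = 0;
	uint16_t path = 0xf;		/* HSRv0 supervision frames use path 0xf */
	uint16_t lsdu_size = 52;	/* example LSDU length in bytes */

	/* Same masking as the two helpers, host byte order */
	field = (field & 0x0FFF) | (path << 12);
	field = (field & 0xF000) | (lsdu_size & 0x0FFF);

	/* Prints "path_and_LSDU_size = 0xf034"; on the wire this value is
	 * stored big-endian via htons(), as in the helpers above.
	 */
	printf("path_and_LSDU_size = 0x%04x\n", field);
	return 0;
}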
*/ spinlock_t seqnr_lock; /* locking for sequence_nr */ unsigned char sup_multicast_addr[ETH_ALEN]; +#ifdef CONFIG_DEBUG_FS + struct dentry *node_tbl_root; + struct dentry *node_tbl_file; +#endif }; #define hsr_for_each_port(hsr, port) \ @@ -187,8 +179,22 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb) { struct hsr_ethhdr *hsr_ethhdr; - hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb); + hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb); return ntohs(hsr_ethhdr->hsr_tag.sequence_nr); } +#if IS_ENABLED(CONFIG_DEBUG_FS) +int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev); +void hsr_debugfs_term(struct hsr_priv *priv); +#else +static inline int hsr_debugfs_init(struct hsr_priv *priv, + struct net_device *hsr_dev) +{ + return 0; +} + +static inline void hsr_debugfs_term(struct hsr_priv *priv) +{} +#endif + #endif /* __HSR_PRIVATE_H */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index b9cce0fd5696..8f8337f893ba 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * @@ -28,7 +24,6 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, }; - /* Here, it seems a netdevice has already been allocated for us, and the * hsr_dev_setup routine has been executed. Nice! */ @@ -47,12 +42,14 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, netdev_info(dev, "HSR: Slave1 device not specified\n"); return -EINVAL; } - link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1])); + link[0] = __dev_get_by_index(src_net, + nla_get_u32(data[IFLA_HSR_SLAVE1])); if (!data[IFLA_HSR_SLAVE2]) { netdev_info(dev, "HSR: Slave2 device not specified\n"); return -EINVAL; } - link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2])); + link[1] = __dev_get_by_index(src_net, + nla_get_u32(data[IFLA_HSR_SLAVE2])); if (!link[0] || !link[1]) return -ENODEV; @@ -119,8 +116,6 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = { .fill_info = hsr_fill_info, }; - - /* attribute policy */ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { [HSR_A_NODE_ADDR] = { .len = ETH_ALEN }, @@ -138,8 +133,6 @@ static const struct genl_multicast_group hsr_mcgrps[] = { { .name = "hsr-network", }, }; - - /* This is called if for some node with MAC address addr, we only get frames * over one of the slave interfaces. This would indicate an open network ring * (i.e. a link has failed somewhere). 
@@ -156,7 +149,8 @@ void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], if (!skb) goto fail; - msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_RING_ERROR); + msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, + HSR_C_RING_ERROR); if (!msg_head) goto nla_put_failure; @@ -201,7 +195,6 @@ void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]) if (!msg_head) goto nla_put_failure; - res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr); if (res < 0) goto nla_put_failure; @@ -221,7 +214,6 @@ fail: rcu_read_unlock(); } - /* HSR_C_GET_NODE_STATUS lets userspace query the internal HSR node table * about the status of a specific node in the network, defined by its MAC * address. @@ -260,15 +252,13 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) goto invalid; hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) goto invalid; if (!is_hsr_master(hsr_dev)) goto invalid; - /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb_out) { res = -ENOMEM; @@ -276,8 +266,8 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) } msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid, - info->snd_seq, &hsr_genl_family, 0, - HSR_C_SET_NODE_STATUS); + info->snd_seq, &hsr_genl_family, 0, + HSR_C_SET_NODE_STATUS); if (!msg_head) { res = -ENOMEM; goto nla_put_failure; @@ -289,28 +279,30 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) hsr = netdev_priv(hsr_dev); res = hsr_get_node_data(hsr, - (unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]), - hsr_node_addr_b, - &addr_b_ifindex, - &hsr_node_if1_age, - &hsr_node_if1_seq, - &hsr_node_if2_age, - &hsr_node_if2_seq); + (unsigned char *) + nla_data(info->attrs[HSR_A_NODE_ADDR]), + hsr_node_addr_b, + &addr_b_ifindex, + &hsr_node_if1_age, + &hsr_node_if1_seq, + &hsr_node_if2_age, + &hsr_node_if2_seq); if (res < 0) goto nla_put_failure; res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, - nla_data(info->attrs[HSR_A_NODE_ADDR])); + nla_data(info->attrs[HSR_A_NODE_ADDR])); if (res < 0) goto nla_put_failure; if (addr_b_ifindex > -1) { res = nla_put(skb_out, HSR_A_NODE_ADDR_B, ETH_ALEN, - hsr_node_addr_b); + hsr_node_addr_b); if (res < 0) goto nla_put_failure; - res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX, addr_b_ifindex); + res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX, + addr_b_ifindex); if (res < 0) goto nla_put_failure; } @@ -392,9 +384,7 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!is_hsr_master(hsr_dev)) goto invalid; - /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb_out) { res = -ENOMEM; @@ -402,8 +392,8 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) } msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid, - info->snd_seq, &hsr_genl_family, 0, - HSR_C_SET_NODE_LIST); + info->snd_seq, &hsr_genl_family, 0, + HSR_C_SET_NODE_LIST); if (!msg_head) { res = -ENOMEM; goto nla_put_failure; @@ -444,19 +434,18 @@ fail: return res; } - static const struct genl_ops hsr_ops[] = { { .cmd = HSR_C_GET_NODE_STATUS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = hsr_genl_policy, .doit = hsr_get_node_status, .dumpit = NULL, }, { .cmd = HSR_C_GET_NODE_LIST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = hsr_genl_policy, 
.doit = hsr_get_node_list, .dumpit = NULL, }, @@ -467,6 +456,7 @@ static struct genl_family hsr_genl_family __ro_after_init = { .name = "HSR", .version = 1, .maxattr = HSR_A_MAX, + .policy = hsr_genl_policy, .module = THIS_MODULE, .ops = hsr_ops, .n_ops = ARRAY_SIZE(hsr_ops), diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 3f6b95b5b6b8..1121bb192a18 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 56080da4aa77..88b6705ded83 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ @@ -18,7 +14,6 @@ #include "hsr_forward.h" #include "hsr_framereg.h" - static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; @@ -61,12 +56,11 @@ bool hsr_port_exists(const struct net_device *dev) return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } - static int hsr_check_dev_ok(struct net_device *dev) { /* Don't allow HSR on non-ethernet like devices */ - if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) || - (dev->addr_len != ETH_ALEN)) { + if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN) { netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n"); return -EINVAL; } @@ -99,7 +93,6 @@ static int hsr_check_dev_ok(struct net_device *dev) return 0; } - /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port) { @@ -143,11 +136,11 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, } port = hsr_port_get_hsr(hsr, type); - if (port != NULL) + if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); - if (port == NULL) + if (!port) return -ENOMEM; if (type != HSR_PT_MASTER) { @@ -184,7 +177,7 @@ void hsr_del_port(struct hsr_port *port) list_del_rcu(&port->port_list); if (port != master) { - if (master != NULL) { + if (master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); } diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h index 3ccfbf71c92e..64b549529592 100644 --- a/net/hsr/hsr_slave.h +++ b/net/hsr/hsr_slave.h @@ -1,11 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * - * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se */ diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h index a5d7515b7f62..bc147bc8e36a 100644 --- a/net/ieee802154/ieee802154.h +++ b/net/ieee802154/ieee802154.h @@ -20,7 +20,6 @@ void ieee802154_nl_exit(void); #define IEEE802154_OP(_cmd, _func) \ { \ .cmd = _cmd, \ - .policy = ieee802154_policy, \ .doit = _func, \ .dumpit = NULL, \ .flags = GENL_ADMIN_PERM, \ @@ -29,7 +28,6 @@ void ieee802154_nl_exit(void); #define IEEE802154_DUMP(_cmd, _func, _dump) \ { \ .cmd = _cmd, \ - .policy = ieee802154_policy, \ .doit = _func, \ .dumpit = _dump, \ } diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 96636e3b7aa9..098d67439b6d 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -136,6 +136,7 @@ struct genl_family nl802154_family __ro_after_init = { .name = IEEE802154_NL_NAME, .version = 1, .maxattr = IEEE802154_ATTR_MAX, + .policy = ieee802154_policy, .module = THIS_MODULE, .ops = ieee802154_ops, .n_ops = ARRAY_SIZE(ieee802154_ops), diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 99f6c254ea77..e4c4174f9efb 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -247,9 +247,11 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, rtnl_lock(); if (!cb->args[0]) { - err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, - genl_family_attrbuf(&nl802154_fam), - nl802154_fam.maxattr, nl802154_policy, NULL); + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl802154_fam.hdrsize, + genl_family_attrbuf(&nl802154_fam), + nl802154_fam.maxattr, + nl802154_policy, NULL); if (err) goto out_unlock; @@ -312,7 +314,7 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, static int nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask) { - struct nlattr *nl_flags = nla_nest_start(msg, attr); + struct nlattr *nl_flags = nla_nest_start_noflag(msg, attr); int i; if (!nl_flags) @@ -338,7 +340,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, struct nlattr *nl_page; unsigned long page; - nl_page = nla_nest_start(msg, NL802154_ATTR_CHANNELS_SUPPORTED); + nl_page = nla_nest_start_noflag(msg, NL802154_ATTR_CHANNELS_SUPPORTED); if (!nl_page) return -ENOBUFS; @@ -360,11 +362,11 @@ nl802154_put_capabilities(struct sk_buff *msg, struct nlattr *nl_caps, *nl_channels; int i; - nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS); + nl_caps = nla_nest_start_noflag(msg, NL802154_ATTR_WPAN_PHY_CAPS); if (!nl_caps) return -ENOBUFS; - nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS); + nl_channels = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CHANNELS); if (!nl_channels) return -ENOBUFS; @@ -380,8 +382,8 @@ nl802154_put_capabilities(struct sk_buff *msg, if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { struct nlattr *nl_ed_lvls; - nl_ed_lvls = nla_nest_start(msg, - NL802154_CAP_ATTR_CCA_ED_LEVELS); + nl_ed_lvls = nla_nest_start_noflag(msg, + NL802154_CAP_ATTR_CCA_ED_LEVELS); if (!nl_ed_lvls) return -ENOBUFS; @@ -396,7 +398,8 @@ nl802154_put_capabilities(struct sk_buff *msg, if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { struct nlattr *nl_tx_pwrs; - nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS); + nl_tx_pwrs = nla_nest_start_noflag(msg, + NL802154_CAP_ATTR_TX_POWERS); if (!nl_tx_pwrs) return -ENOBUFS; @@ -504,7 +507,7 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, if (nl802154_put_capabilities(msg, rdev)) goto 
nla_put_failure; - nl_cmds = nla_nest_start(msg, NL802154_ATTR_SUPPORTED_COMMANDS); + nl_cmds = nla_nest_start_noflag(msg, NL802154_ATTR_SUPPORTED_COMMANDS); if (!nl_cmds) goto nla_put_failure; @@ -561,8 +564,10 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct nl802154_dump_wpan_phy_state *state) { struct nlattr **tb = genl_family_attrbuf(&nl802154_fam); - int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, tb, - nl802154_fam.maxattr, nl802154_policy, NULL); + int ret = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl802154_fam.hdrsize, + tb, nl802154_fam.maxattr, + nl802154_policy, NULL); /* TODO check if we can handle error here, * we have no backward compatibility @@ -693,7 +698,8 @@ ieee802154_llsec_send_key_id(struct sk_buff *msg, switch (desc->mode) { case NL802154_KEY_ID_MODE_IMPLICIT: - nl_dev_addr = nla_nest_start(msg, NL802154_KEY_ID_ATTR_IMPLICIT); + nl_dev_addr = nla_nest_start_noflag(msg, + NL802154_KEY_ID_ATTR_IMPLICIT); if (!nl_dev_addr) return -ENOBUFS; @@ -768,7 +774,7 @@ static int nl802154_get_llsec_params(struct sk_buff *msg, params.frame_counter)) return -ENOBUFS; - nl_key_id = nla_nest_start(msg, NL802154_ATTR_SEC_OUT_KEY_ID); + nl_key_id = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_OUT_KEY_ID); if (!nl_key_id) return -ENOBUFS; @@ -1306,8 +1312,7 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla, { struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; - if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, - nl802154_dev_addr_policy, NULL)) + if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, nl802154_dev_addr_policy, NULL)) return -EINVAL; if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || @@ -1346,8 +1351,7 @@ ieee802154_llsec_parse_key_id(struct nlattr *nla, { struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; - if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla, - nl802154_key_id_policy, NULL)) + if (!nla || nla_parse_nested_deprecated(attrs, NL802154_KEY_ID_ATTR_MAX, nla, nl802154_key_id_policy, NULL)) return -EINVAL; if (!attrs[NL802154_KEY_ID_ATTR_MODE]) @@ -1455,11 +1459,11 @@ static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; - nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY); + nl_key = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_KEY); if (!nl_key) goto nla_put_failure; - nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID); + nl_key_id = nla_nest_start_noflag(msg, NL802154_KEY_ATTR_ID); if (!nl_key_id) goto nla_put_failure; @@ -1562,9 +1566,7 @@ static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) struct ieee802154_llsec_key_id id = { }; u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; - if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, - info->attrs[NL802154_ATTR_SEC_KEY], - nl802154_key_policy, info->extack)) + if (nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || @@ -1612,9 +1614,7 @@ static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; struct ieee802154_llsec_key_id id; - if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, - info->attrs[NL802154_ATTR_SEC_KEY], - nl802154_key_policy, info->extack)) + if (nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], 
nl802154_key_policy, info->extack)) return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) @@ -1639,7 +1639,7 @@ static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; - nl_device = nla_nest_start(msg, NL802154_ATTR_SEC_DEVICE); + nl_device = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVICE); if (!nl_device) goto nla_put_failure; @@ -1728,8 +1728,7 @@ ieee802154_llsec_parse_device(struct nlattr *nla, { struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; - if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, - nla, nl802154_dev_policy, NULL)) + if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, nla, nl802154_dev_policy, NULL)) return -EINVAL; memset(dev, 0, sizeof(*dev)); @@ -1780,9 +1779,7 @@ static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; __le64 extended_addr; - if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, - info->attrs[NL802154_ATTR_SEC_DEVICE], - nl802154_dev_policy, info->extack)) + if (nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], nl802154_dev_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) @@ -1808,7 +1805,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; - nl_devkey = nla_nest_start(msg, NL802154_ATTR_SEC_DEVKEY); + nl_devkey = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVKEY); if (!nl_devkey) goto nla_put_failure; @@ -1818,7 +1815,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, devkey->frame_counter)) goto nla_put_failure; - nl_key_id = nla_nest_start(msg, NL802154_DEVKEY_ATTR_ID); + nl_key_id = nla_nest_start_noflag(msg, NL802154_DEVKEY_ATTR_ID); if (!nl_key_id) goto nla_put_failure; @@ -1908,9 +1905,7 @@ static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info __le64 extended_addr; if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || - nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, - info->attrs[NL802154_ATTR_SEC_DEVKEY], - nl802154_devkey_policy, info->extack) < 0) + nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack) < 0) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || @@ -1940,9 +1935,7 @@ static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info struct ieee802154_llsec_device_key key; __le64 extended_addr; - if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, - info->attrs[NL802154_ATTR_SEC_DEVKEY], - nl802154_devkey_policy, info->extack)) + if (nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) @@ -1976,7 +1969,7 @@ static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; - nl_seclevel = nla_nest_start(msg, NL802154_ATTR_SEC_LEVEL); + nl_seclevel = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_LEVEL); if (!nl_seclevel) goto nla_put_failure; @@ -2062,8 +2055,7 @@ llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) { struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; - if (!nla || 
nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, - nla, nl802154_seclevel_policy, NULL)) + if (!nla || nla_parse_nested_deprecated(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, nl802154_seclevel_policy, NULL)) return -EINVAL; memset(sl, 0, sizeof(*sl)); @@ -2217,131 +2209,131 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, - .policy = nl802154_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_get_interface, .dumpit = nl802154_dump_interface, - .policy = nl802154_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_new_interface, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_interface, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CHANNEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_channel, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CCA_MODE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_cca_mode, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CCA_ED_LEVEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_cca_ed_level, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_TX_POWER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_tx_power, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_wpan_phy_netns, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_PAN_ID, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_pan_id, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_SHORT_ADDR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_short_addr, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_BACKOFF_EXPONENT, + .validate = 
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_backoff_exponent, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_CSMA_BACKOFFS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_max_csma_backoffs, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_FRAME_RETRIES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_max_frame_retries, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_LBT_MODE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_lbt_mode, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_ackreq_default, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, @@ -2349,33 +2341,33 @@ static const struct genl_ops nl802154_ops[] = { #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL { .cmd = NL802154_CMD_SET_SEC_PARAMS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_llsec_params, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_SEC_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_key, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_key, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, @@ -2383,25 +2375,25 @@ static const struct genl_ops nl802154_ops[] = { /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO .doit by matching extended_addr? 
*/ .dumpit = nl802154_dump_llsec_dev, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_DEV, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_dev, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_DEV, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_dev, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, @@ -2409,51 +2401,51 @@ static const struct genl_ops nl802154_ops[] = { /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_DEVKEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_devkey, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_DEVKEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_devkey, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_LEVEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_seclevel, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_LEVEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO match frame_type only? 
*/ .doit = nl802154_del_llsec_seclevel, - .policy = nl802154_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, @@ -2466,6 +2458,7 @@ static struct genl_family nl802154_fam __ro_after_init = { .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ .maxattr = NL802154_ATTR_MAX, + .policy = nl802154_policy, .netnsok = true, .pre_doit = nl802154_pre_doit, .post_doit = nl802154_post_doit, diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index bc6b912603f1..ce2dfb997537 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -164,10 +164,6 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, struct sock *sk = sock->sk; switch (cmd) { - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); case SIOCGIFADDR: case SIOCSIFADDR: return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, @@ -426,6 +422,7 @@ static const struct proto_ops ieee802154_raw_ops = { .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, @@ -988,6 +985,7 @@ static const struct proto_ops ieee802154_dgram_ops = { .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 32cae39cdff6..8108e97d4285 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -304,7 +304,7 @@ config NET_IPVTI tristate "Virtual (secure) IP: tunneling" select INET_TUNNEL select NET_IP_TUNNEL - depends on INET_XFRM_MODE_TUNNEL + select XFRM ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -396,33 +396,6 @@ config INET_TUNNEL tristate default n -config INET_XFRM_MODE_TRANSPORT - tristate "IP: IPsec transport mode" - default y - select XFRM - ---help--- - Support for IPsec transport mode. - - If unsure, say Y. - -config INET_XFRM_MODE_TUNNEL - tristate "IP: IPsec tunnel mode" - default y - select XFRM - ---help--- - Support for IPsec tunnel mode. - - If unsure, say Y. - -config INET_XFRM_MODE_BEET - tristate "IP: IPsec BEET mode" - default y - select XFRM - ---help--- - Support for IPsec BEET mode. - - If unsure, say Y. 
- config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 58629314eae9..000a61994c8f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -37,10 +37,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o -obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o -obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o -obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_INET_DIAG) += inet_diag.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eab3ebde981e..5183a2daba64 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk) struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_rx_skb_cache) { + __kfree_skb(sk->sk_rx_skb_cache); + sk->sk_rx_skb_cache = NULL; + } __skb_queue_purge(&sk->sk_error_queue); sk_mem_reclaim(sk); @@ -156,7 +160,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); - dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); + dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(sk->sk_rx_dst); sk_refcnt_debug_dec(sk); } @@ -911,12 +915,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct rtentry rt; switch (cmd) { - case SIOCGSTAMP: - err = sock_get_timestamp(sk, (struct timeval __user *)arg); - break; - case SIOCGSTAMPNS: - err = sock_get_timestampns(sk, (struct timespec __user *)arg); - break; case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) @@ -988,6 +986,7 @@ const struct proto_ops inet_stream_ops = { .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, @@ -1023,6 +1022,7 @@ const struct proto_ops inet_dgram_ops = { .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, @@ -1055,6 +1055,7 @@ static const struct proto_ops inet_sockraw_ops = { .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c index 1e976bb93d99..15427163a041 100644 --- a/net/ipv4/bpfilter/sockopt.c +++ b/net/ipv4/bpfilter/sockopt.c @@ -77,5 +77,4 @@ static int __init bpfilter_sockopt_init(void) return 0; } - -module_init(bpfilter_sockopt_init); +device_initcall(bpfilter_sockopt_init); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index eb514f312e6f..701c5d113a34 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -621,8 +621,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); if (err < 0) goto errout; @@ -793,8 +793,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, struct 
in_device *in_dev; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); if (err < 0) goto errout; @@ -1689,8 +1689,8 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, fillargs->flags |= NLM_F_DUMP_FILTERED; } - err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, - ifa_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); if (err < 0) return err; @@ -1906,7 +1906,8 @@ static int inet_validate_link_af(const struct net_device *dev, if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; - err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); + err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, + inet_af_policy, NULL); if (err < 0) return err; @@ -1934,7 +1935,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) if (!in_dev) return -EAFNOSUPPORT; - if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) BUG(); if (tb[IFLA_INET_CONF]) { @@ -2076,11 +2077,13 @@ static int inet_netconf_valid_get_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_ipv4_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_ipv4_policy, extack); - err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_ipv4_policy, extack); if (err) return err; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 10e809b296ec..fb065a8937ea 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -226,7 +226,7 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { int encap_type; struct udphdr *uh; @@ -234,6 +234,7 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru __be16 sport, dport; struct xfrm_encap_tmpl *encap = x->encap; struct ip_esp_hdr *esph = esp->esph; + unsigned int len; spin_lock_bh(&x->lock); sport = encap->encap_sport; @@ -241,11 +242,14 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru encap_type = encap->encap_type; spin_unlock_bh(&x->lock); + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len + sizeof(struct iphdr) >= IP_MAX_MTU) + return -EMSGSIZE; + uh = (struct udphdr *)esph; uh->source = sport; uh->dest = dport; - uh->len = htons(skb->len + esp->tailen - - skb_transport_offset(skb)); + uh->len = htons(len); uh->check = 0; switch (encap_type) { @@ -262,6 +266,8 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru *skb_mac_header(skb) = IPPROTO_UDP; esp->esph = esph; + + return 0; } int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) @@ -275,8 +281,12 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * int tailen = esp->tailen; /* this is non-NULL only with UDP Encapsulation */ - if (x->encap) - esp_output_udp_encap(x, skb, esp); + if (x->encap) { + int 
err = esp_output_udp_encap(x, skb, esp); + + if (err < 0) + return err; + } if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 8756e0e790d2..8edcfa66d1e5 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -52,13 +52,13 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, goto out; if (sp->len == XFRM_MAX_DEPTH) - goto out; + goto out_reset; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (!x) - goto out; + goto out_reset; sp->xvec[sp->len++] = x; sp->olen++; @@ -66,7 +66,7 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo) { xfrm_state_put(x); - goto out; + goto out_reset; } } @@ -82,6 +82,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xfrm_input(skb, IPPROTO_ESP, spi, -2); return ERR_PTR(-EINPROGRESS); +out_reset: + secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; @@ -107,6 +109,44 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) xo->proto = proto; } +static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + __skb_push(skb, skb->mac_len); + return skb_mac_gso_segment(skb, features); +} + +static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct xfrm_offload *xo = xfrm_offload(skb); + + skb->transport_header += x->props.header_len; + ops = rcu_dereference(inet_offloads[xo->proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + +static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + switch (x->outer_mode.encap) { + case XFRM_MODE_TUNNEL: + return xfrm4_tunnel_gso_segment(x, skb, features); + case XFRM_MODE_TRANSPORT: + return xfrm4_transport_gso_segment(x, skb, features); + } + + return ERR_PTR(-EOPNOTSUPP); +} + static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -138,14 +178,16 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) + if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) && + !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && + !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; - return x->outer_mode->gso_segment(x, skb, esp_features); + return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -181,7 +223,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ if (!xo) return -EINVAL; - if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) { + if ((!(features & NETIF_F_HW_ESP) && + !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || + x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 
ed14ec245584..b298255f6fdb 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -307,7 +307,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) - return FIB_RES_PREFSRC(net, res); + return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } @@ -324,16 +324,16 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) for (ret = 0; ret < fi->fib_nhs; ret++) { struct fib_nh *nh = &fi->fib_nh[ret]; - if (nh->nh_dev == dev) { + if (nh->fib_nh_dev == dev) { dev_match = true; break; - } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + } else if (l3mdev_master_ifindex_rcu(nh->fib_nh_dev) == dev->ifindex) { dev_match = true; break; } } #else - if (fi->fib_nh[0].nh_dev == dev) + if (fi->fib_nh[0].fib_nh_dev == dev) dev_match = true; #endif @@ -390,7 +390,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = fib_info_nh_uses_dev(res.fi, dev); if (dev_match) { - ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) @@ -402,7 +402,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; @@ -558,7 +558,8 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; - cfg->fc_gw = addr; + cfg->fc_gw4 = addr; + cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) @@ -568,7 +569,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, if (cmd == SIOCDELRT) return 0; - if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw) + if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) @@ -664,16 +665,61 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DPORT] = { .type = NLA_U16 }, }; +int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + struct rtvia *via; + int alen; + + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { + NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); + return -EINVAL; + } + + via = nla_data(nla); + alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); + + switch (via->rtvia_family) { + case AF_INET: + if (alen != sizeof(__be32)) { + NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET; + cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); + break; + case AF_INET6: +#ifdef CONFIG_IPV6 + if (alen != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); + return -EINVAL; + } + cfg->fc_gw_family = AF_INET6; + cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); +#else + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EINVAL; +#endif + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); + return -EINVAL; + } + + return 0; +} + static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { + bool has_gw = false, has_via 
= false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; - err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, - extack); + err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, + rtm_ipv4_policy, extack); if (err < 0) goto errout; @@ -708,12 +754,17 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: - cfg->fc_gw = nla_get_be32(attr); + has_gw = true; + cfg->fc_gw4 = nla_get_be32(attr); + if (cfg->fc_gw4) + cfg->fc_gw_family = AF_INET; break; case RTA_VIA: - NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); - err = -EINVAL; - goto errout; + has_via = true; + err = fib_gw_from_via(cfg, attr, extack); + if (err) + goto errout; + break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; @@ -752,6 +803,13 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (has_gw && has_via) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + err = -EINVAL; + goto errout; + } + return 0; errout: return err; @@ -839,8 +896,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); if (err < 0) return err; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index e6ff282bb7f4..7945f0534db7 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -45,6 +45,7 @@ static inline void fib_result_assign(struct fib_result *res, { /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; + res->nhc = fib_info_nhc(fi, 0); } struct fib_prop { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8e185b5a2bf6..d3da6a10f86f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -41,10 +41,12 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/ip6_fib.h> #include <net/netlink.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> +#include <net/addrconf.h> #include "fib_lookup.h" @@ -157,12 +159,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) dst_release_immediate(&rt->dst); } -static void free_nh_exceptions(struct fib_nh *nh) +static void free_nh_exceptions(struct fib_nh_common *nhc) { struct fnhe_hash_bucket *hash; int i; - hash = rcu_dereference_protected(nh->nh_exceptions, 1); + hash = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!hash) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { @@ -204,18 +206,34 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) free_percpu(rtp); } +void fib_nh_common_release(struct fib_nh_common *nhc) +{ + if (nhc->nhc_dev) + dev_put(nhc->nhc_dev); + + lwtstate_put(nhc->nhc_lwtstate); + rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); + rt_fibinfo_free(&nhc->nhc_rth_input); + free_nh_exceptions(nhc); +} +EXPORT_SYMBOL_GPL(fib_nh_common_release); + +void fib_nh_release(struct net *net, struct fib_nh *fib_nh) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + if (fib_nh->nh_tclassid) + net->ipv4.fib_num_tclassid_users--; +#endif + fib_nh_common_release(&fib_nh->nh_common); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); change_nexthops(fi) { - if
(nexthop_nh->nh_dev) - dev_put(nexthop_nh->nh_dev); - lwtstate_put(nexthop_nh->nh_lwtstate); - free_nh_exceptions(nexthop_nh); - rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); - rt_fibinfo_free(&nexthop_nh->nh_rth_input); + fib_nh_release(fi->fib_net, nexthop_nh); } endfor_nexthops(fi); ip_fib_metrics_put(fi->fib_metrics); @@ -230,12 +248,7 @@ void free_fib_info(struct fib_info *fi) return; } fib_info_cnt--; -#ifdef CONFIG_IP_ROUTE_CLASSID - change_nexthops(fi) { - if (nexthop_nh->nh_tclassid) - fi->fib_net->ipv4.fib_num_tclassid_users--; - } endfor_nexthops(fi); -#endif + call_rcu(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); @@ -248,7 +261,7 @@ void fib_release_info(struct fib_info *fi) if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); change_nexthops(fi) { - if (!nexthop_nh->nh_dev) + if (!nexthop_nh->fib_nh_dev) continue; hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) @@ -263,18 +276,27 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) const struct fib_nh *onh = ofi->fib_nh; for_nexthops(fi) { - if (nh->nh_oif != onh->nh_oif || - nh->nh_gw != onh->nh_gw || - nh->nh_scope != onh->nh_scope || + if (nh->fib_nh_oif != onh->fib_nh_oif || + nh->fib_nh_gw_family != onh->fib_nh_gw_family || + nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight != onh->nh_weight || + nh->fib_nh_weight != onh->fib_nh_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) + lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || + ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) + return -1; + + if (nh->fib_nh_gw_family == AF_INET && + nh->fib_nh_gw4 != onh->fib_nh_gw4) return -1; + + if (nh->fib_nh_gw_family == AF_INET6 && + ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) + return -1; + onh++; } endfor_nexthops(fi); return 0; @@ -298,7 +320,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->nh_oif); + val ^= fib_devindex_hashfn(nh->fib_nh_oif); } endfor_nexthops(fi) return (val ^ (val >> 7) ^ (val >> 12)) & mask; @@ -347,9 +369,9 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->nh_dev == dev && - nh->nh_gw == gw && - !(nh->nh_flags & RTNH_F_DEAD)) { + if (nh->fib_nh_dev == dev && + nh->fib_nh_gw4 == gw && + !(nh->fib_nh_flags & RTNH_F_DEAD)) { spin_unlock(&fib_info_lock); return 0; } @@ -384,10 +406,10 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* grab encap info */ for_nexthops(fi) { - if (nh->nh_lwtstate) { + if (nh->fib_nh_lws) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( - nh->nh_lwtstate); + nh->fib_nh_lws); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } @@ -435,10 +457,18 @@ static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); + if (likely(nhc->nhc_gw_family == AF_INET)) + n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); + else if (nhc->nhc_gw_family == AF_INET6) + n = 
neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, + nhc->nhc_dev); + else + n = NULL; + if (n) { state = n->nud_state; neigh_release(n); @@ -457,6 +487,75 @@ static int fib_detect_death(struct fib_info *fi, int order, return 1; } +int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap, + u16 encap_type, void *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + int err; + + nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, + gfp_flags); + if (!nhc->nhc_pcpu_rth_output) + return -ENOMEM; + + if (encap) { + struct lwtunnel_state *lwtstate; + + if (encap_type == LWTUNNEL_ENCAP_NONE) { + NL_SET_ERR_MSG(extack, "LWT encap type not specified"); + err = -EINVAL; + goto lwt_failure; + } + err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family, + cfg, &lwtstate, extack); + if (err) + goto lwt_failure; + + nhc->nhc_lwtstate = lwtstate_get(lwtstate); + } + + return 0; + +lwt_failure: + rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); + nhc->nhc_pcpu_rth_output = NULL; + return err; +} +EXPORT_SYMBOL_GPL(fib_nh_common_init); + +int fib_nh_init(struct net *net, struct fib_nh *nh, + struct fib_config *cfg, int nh_weight, + struct netlink_ext_ack *extack) +{ + int err; + + nh->fib_nh_family = AF_INET; + + err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap, + cfg->fc_encap_type, cfg, GFP_KERNEL, extack); + if (err) + return err; + + nh->fib_nh_oif = cfg->fc_oif; + nh->fib_nh_gw_family = cfg->fc_gw_family; + if (cfg->fc_gw_family == AF_INET) + nh->fib_nh_gw4 = cfg->fc_gw4; + else if (cfg->fc_gw_family == AF_INET6) + nh->fib_nh_gw6 = cfg->fc_gw6; + + nh->fib_nh_flags = cfg->fc_flags; + +#ifdef CONFIG_IP_ROUTE_CLASSID + nh->nh_tclassid = cfg->fc_flow; + if (nh->nh_tclassid) + net->ipv4.fib_num_tclassid_users++; +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->fib_nh_weight = nh_weight; +#endif + return 0; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, @@ -483,11 +582,15 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { + struct net *net = fi->fib_net; + struct fib_config fib_cfg; int ret; change_nexthops(fi) { int attrlen; + memset(&fib_cfg, 0, sizeof(fib_cfg)); + if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthop"); @@ -500,56 +603,73 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, return -EINVAL; } - nexthop_nh->nh_flags = - (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; - nexthop_nh->nh_oif = rtnh->rtnh_ifindex; - nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; + fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; + fib_cfg.fc_oif = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0; -#ifdef CONFIG_IP_ROUTE_CLASSID - nla = nla_find(attrs, attrlen, RTA_FLOW); - nexthop_nh->nh_tclassid = nla ? 
nla_get_u32(nla) : 0; - if (nexthop_nh->nh_tclassid) - fi->fib_net->ipv4.fib_num_tclassid_users++; -#endif - nla = nla_find(attrs, attrlen, RTA_ENCAP); + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } if (nla) { - struct lwtunnel_state *lwtstate; - struct nlattr *nla_entype; - - nla_entype = nla_find(attrs, attrlen, - RTA_ENCAP_TYPE); - if (!nla_entype) { - NL_SET_BAD_ATTR(extack, nla); - NL_SET_ERR_MSG(extack, - "Encap type is missing"); - goto err_inval; - } - - ret = lwtunnel_build_state(nla_get_u16( - nla_entype), - nla, AF_INET, cfg, - &lwtstate, extack); + fib_cfg.fc_gw4 = nla_get_in_addr(nla); + if (fib_cfg.fc_gw4) + fib_cfg.fc_gw_family = AF_INET; + } else if (nlav) { + ret = fib_gw_from_via(&fib_cfg, nlav, extack); if (ret) goto errout; - nexthop_nh->nh_lwtstate = - lwtstate_get(lwtstate); } + + nla = nla_find(attrs, attrlen, RTA_FLOW); + if (nla) + fib_cfg.fc_flow = nla_get_u32(nla); + + fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + fib_cfg.fc_encap_type = nla_get_u16(nla); } + ret = fib_nh_init(net, nexthop_nh, &fib_cfg, + rtnh->rtnh_hops + 1, extack); + if (ret) + goto errout; + rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); - return 0; - -err_inval: ret = -EINVAL; - + if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) { + NL_SET_ERR_MSG(extack, + "Nexthop device index does not match RTA_OIF"); + goto errout; + } + if (cfg->fc_gw_family) { + if (cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family || + (cfg->fc_gw_family == AF_INET && + fi->fib_nh->fib_nh_gw4 != cfg->fc_gw4) || + (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&fi->fib_nh->fib_nh_gw6, &cfg->fc_gw6))) { + NL_SET_ERR_MSG(extack, + "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); + goto errout; + } + } +#ifdef CONFIG_IP_ROUTE_CLASSID + if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { + NL_SET_ERR_MSG(extack, + "Nexthop class id does not match RTA_FLOW"); + goto errout; + } +#endif + ret = 0; errout: return ret; } @@ -558,49 +678,51 @@ static void fib_rebalance(struct fib_info *fi) { int total; int w; - struct in_device *in_dev; if (fi->fib_nhs < 2) return; total = 0; for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) continue; - in_dev = __in_dev_get_rtnl(nh->nh_dev); - - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nh->nh_flags & RTNH_F_LINKDOWN) + if (ip_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags & RTNH_F_LINKDOWN) continue; - total += nh->nh_weight; + total += nh->fib_nh_weight; } endfor_nexthops(fi); w = 0; change_nexthops(fi) { int upper_bound; - in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev); - - if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) { upper_bound = -1; - } else if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nexthop_nh->nh_flags & RTNH_F_LINKDOWN) { + } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) && + nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { - w += nexthop_nh->nh_weight; + w += nexthop_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; } - atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); + atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound); } endfor_nexthops(fi); } #else /* CONFIG_IP_ROUTE_MULTIPATH */ +static int fib_get_nhs(struct fib_info *fi, struct 
rtnexthop *rtnh, + int remaining, struct fib_config *cfg, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel"); + + return -EINVAL; +} + #define fib_rebalance(fi) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -620,7 +742,7 @@ static int fib_encap_match(u16 encap_type, ret = lwtunnel_build_state(encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { - result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); lwtstate_free(lwtstate); } @@ -638,7 +760,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; - if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_oif || cfg->fc_gw_family) { if (cfg->fc_encap) { if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, fi->fib_nh, cfg, extack)) @@ -649,10 +771,20 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, cfg->fc_flow != fi->fib_nh->nh_tclassid) return 1; #endif - if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && - (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) - return 0; - return 1; + if ((cfg->fc_oif && cfg->fc_oif != fi->fib_nh->fib_nh_oif) || + (cfg->fc_gw_family && + cfg->fc_gw_family != fi->fib_nh->fib_nh_gw_family)) + return 1; + + if (cfg->fc_gw_family == AF_INET && + cfg->fc_gw4 != fi->fib_nh->fib_nh_gw4) + return 1; + + if (cfg->fc_gw_family == AF_INET6 && + ipv6_addr_cmp(&cfg->fc_gw6, &fi->fib_nh->fib_nh_gw6)) + return 1; + + return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -668,16 +800,48 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; - if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif) + if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif) return 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_in_addr(nla) != nh->nh_gw) - return 1; + nlav = nla_find(attrs, attrlen, RTA_VIA); + if (nla && nlav) { + NL_SET_ERR_MSG(extack, + "Nexthop configuration can not contain both GATEWAY and VIA"); + return -EINVAL; + } + + if (nla) { + if (nh->fib_nh_gw_family != AF_INET || + nla_get_in_addr(nla) != nh->fib_nh_gw4) + return 1; + } else if (nlav) { + struct fib_config cfg2; + int err; + + err = fib_gw_from_via(&cfg2, nlav, extack); + if (err) + return err; + + switch (nh->fib_nh_gw_family) { + case AF_INET: + if (cfg2.fc_gw_family != AF_INET || + cfg2.fc_gw4 != nh->fib_nh_gw4) + return 1; + break; + case AF_INET6: + if (cfg2.fc_gw_family != AF_INET6 || + ipv6_addr_cmp(&cfg2.fc_gw6, + &nh->fib_nh_gw6)) + return 1; + break; + } + } + #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) @@ -731,6 +895,30 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) return true; } +static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, + u32 table, struct netlink_ext_ack *extack) +{ + struct fib6_config cfg = { + .fc_table = table, + .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, + .fc_ifindex = nh->fib_nh_oif, + .fc_gateway = nh->fib_nh_gw6, + }; + struct fib6_nh fib6_nh = {}; + int err; + + err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); + if (!err) { + nh->fib_nh_dev = fib6_nh.fib_nh_dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_oif = 
nh->fib_nh_dev->ifindex; + nh->fib_nh_scope = RT_SCOPE_LINK; + + ipv6_stub->fib6_nh_release(&fib6_nh); + } + + return err; +} /* * Picture @@ -775,133 +963,152 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, - struct netlink_ext_ack *extack) +static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, + u8 scope, struct netlink_ext_ack *extack) { - int err = 0; - struct net *net; struct net_device *dev; + struct fib_result res; + int err; - net = cfg->fc_nlinfo.nl_net; - if (nh->nh_gw) { - struct fib_result res; - - if (nh->nh_flags & RTNH_F_ONLINK) { - unsigned int addr_type; + if (nh->fib_nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; - if (cfg->fc_scope >= RT_SCOPE_LINK) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid scope"); - return -EINVAL; - } - dev = __dev_get_by_index(net, nh->nh_oif); - if (!dev) { - NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); - return -ENODEV; - } - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, - "Nexthop device is not up"); - return -ENETDOWN; - } - addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); - if (addr_type != RTN_UNICAST) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - return -EINVAL; - } - if (!netif_carrier_ok(dev)) - nh->nh_flags |= RTNH_F_LINKDOWN; - nh->nh_dev = dev; - dev_hold(dev); - nh->nh_scope = RT_SCOPE_LINK; - return 0; + if (scope >= RT_SCOPE_LINK) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); + return -EINVAL; } - rcu_read_lock(); - { - struct fib_table *tbl = NULL; - struct flowi4 fl4 = { - .daddr = nh->nh_gw, - .flowi4_scope = cfg->fc_scope + 1, - .flowi4_oif = nh->nh_oif, - .flowi4_iif = LOOPBACK_IFINDEX, - }; - - /* It is not necessary, but requires a bit of thinking */ - if (fl4.flowi4_scope < RT_SCOPE_LINK) - fl4.flowi4_scope = RT_SCOPE_LINK; - - if (cfg->fc_table) - tbl = fib_get_table(net, cfg->fc_table); - - if (tbl) - err = fib_table_lookup(tbl, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE | - FIB_LOOKUP_NOREF); - - /* on error or if no table given do full lookup. This - * is needed for example when nexthops are in the local - * table rather than the given table - */ - if (!tbl || err) { - err = fib_lookup(net, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); - } - - if (err) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway"); - rcu_read_unlock(); - return err; - } + dev = __dev_get_by_index(net, nh->fib_nh_oif); + if (!dev) { + NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); + return -ENODEV; } - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { - NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); - goto out; + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + return -ENETDOWN; } - nh->nh_scope = res.scope; - nh->nh_oif = FIB_RES_OIF(res); - nh->nh_dev = dev = FIB_RES_DEV(res); - if (!dev) { - NL_SET_ERR_MSG(extack, - "No egress device for nexthop gateway"); - goto out; + addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); + if (addr_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + return -EINVAL; } - dev_hold(dev); if (!netif_carrier_ok(dev)) - nh->nh_flags |= RTNH_F_LINKDOWN; - err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; - } else { - struct in_device *in_dev; - - if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { - NL_SET_ERR_MSG(extack, - "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); - return -EINVAL; + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + nh->fib_nh_dev = dev; + dev_hold(dev); + nh->fib_nh_scope = RT_SCOPE_LINK; + return 0; + } + rcu_read_lock(); + { + struct fib_table *tbl = NULL; + struct flowi4 fl4 = { + .daddr = nh->fib_nh_gw4, + .flowi4_scope = scope + 1, + .flowi4_oif = nh->fib_nh_oif, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + + if (table) + tbl = fib_get_table(net, table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); } - rcu_read_lock(); - err = -ENODEV; - in_dev = inetdev_by_index(net, nh->nh_oif); - if (!in_dev) - goto out; - err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } - nh->nh_dev = in_dev->dev; - dev_hold(nh->nh_dev); - nh->nh_scope = RT_SCOPE_HOST; - if (!netif_carrier_ok(nh->nh_dev)) - nh->nh_flags |= RTNH_F_LINKDOWN; - err = 0; } + + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + goto out; + } + nh->fib_nh_scope = res.scope; + nh->fib_nh_oif = FIB_RES_OIF(res); + nh->fib_nh_dev = dev = FIB_RES_DEV(res); + if (!dev) { + NL_SET_ERR_MSG(extack, + "No egress device for nexthop gateway"); + goto out; + } + dev_hold(dev); + if (!netif_carrier_ok(dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; out: rcu_read_unlock(); return err; } +static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct in_device *in_dev; + int err; + + if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { + NL_SET_ERR_MSG(extack, + "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); + return -EINVAL; + } + + rcu_read_lock(); + + err = -ENODEV; + in_dev = inetdev_by_index(net, nh->fib_nh_oif); + if (!in_dev) + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); + goto out; + } + + nh->fib_nh_dev = in_dev->dev; + dev_hold(nh->fib_nh_dev); + nh->fib_nh_scope = RT_SCOPE_HOST; + if (!netif_carrier_ok(nh->fib_nh_dev)) + nh->fib_nh_flags |= RTNH_F_LINKDOWN; + err = 0; +out: + rcu_read_unlock(); + return err; +} + +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + u32 table = cfg->fc_table; + int err; + + if (nh->fib_nh_gw_family == AF_INET) + err = fib_check_nh_v4_gw(net, nh, table, cfg->fc_scope, extack); + else if (nh->fib_nh_gw_family == AF_INET6) + err = fib_check_nh_v6_gw(net, nh, table, extack); + else + err = fib_check_nh_nongw(net, nh, extack); + + return err; +} + static inline unsigned int fib_laddr_hashfn(__be32 val) { unsigned int mask = (fib_info_hash_size - 1); @@ -986,14 +1193,29 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) { - nh->nh_saddr = inet_select_addr(nh->nh_dev, - nh->nh_gw, + nh->nh_saddr = inet_select_addr(nh->fib_nh_dev, + nh->fib_nh_gw4, nh->nh_parent->fib_scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; } +__be32 fib_result_prefsrc(struct net *net, struct fib_result *res) +{ + struct fib_nh_common *nhc = res->nhc; + struct fib_nh *nh; + + if (res->fi->fib_prefsrc) + return res->fi->fib_prefsrc; + + nh = container_of(nhc, struct fib_nh, nh_common); + if (nh->nh_saddr_genid == atomic_read(&net->ipv4.dev_addr_genid)) + return nh->nh_saddr; + + return fib_info_update_nh_saddr(net, nh); +} + static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || @@ -1096,72 +1318,18 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi->fib_nhs = nhs; change_nexthops(fi) { nexthop_nh->nh_parent = fi; - nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); - if (!nexthop_nh->nh_pcpu_rth_output) - goto failure; } endfor_nexthops(fi) - if (cfg->fc_mp) { -#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (cfg->fc_mp) err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); - if (err != 0) - goto failure; - if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) { - NL_SET_ERR_MSG(extack, - "Nexthop device index does not match RTA_OIF"); - goto err_inval; - } - if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) { - NL_SET_ERR_MSG(extack, - "Nexthop gateway does not match RTA_GATEWAY"); - goto err_inval; - } -#ifdef CONFIG_IP_ROUTE_CLASSID - if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) { - NL_SET_ERR_MSG(extack, - "Nexthop class id does not match RTA_FLOW"); - goto err_inval; - } -#endif -#else - NL_SET_ERR_MSG(extack, - "Multipath support not enabled in kernel"); - goto err_inval; -#endif - } else { - struct fib_nh *nh = fi->fib_nh; - - if (cfg->fc_encap) { - struct lwtunnel_state *lwtstate; - - if (cfg->fc_encap_type == 
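
The split of fib_check_nh() above leaves it as a thin dispatcher on the gateway family, which is what lets an IPv4 route carry an IPv6 gateway. A minimal standalone sketch of the same dispatch pattern (all types and helpers here are illustrative, not kernel APIs):

    #include <errno.h>
    #include <sys/socket.h>             /* AF_INET, AF_INET6, AF_UNSPEC */

    struct nexthop { int gw_family; };  /* AF_UNSPEC: no gateway configured */

    static int check_v4_gw(struct nexthop *nh) { (void)nh; return 0; }
    static int check_v6_gw(struct nexthop *nh) { (void)nh; return 0; }
    static int check_no_gw(struct nexthop *nh) { (void)nh; return 0; }

    static int check_nexthop(struct nexthop *nh)
    {
            switch (nh->gw_family) {
            case AF_INET:
                    return check_v4_gw(nh);  /* validate an IPv4 gateway */
            case AF_INET6:
                    return check_v6_gw(nh);  /* IPv6 gateway on an IPv4 route */
            case AF_UNSPEC:
                    return check_no_gw(nh);  /* device-only route */
            default:
                    return -EINVAL;
            }
    }
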
LWTUNNEL_ENCAP_NONE) { - NL_SET_ERR_MSG(extack, - "LWT encap type not specified"); - goto err_inval; - } - err = lwtunnel_build_state(cfg->fc_encap_type, - cfg->fc_encap, AF_INET, cfg, - &lwtstate, extack); - if (err) - goto failure; + else + err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); - nh->nh_lwtstate = lwtstate_get(lwtstate); - } - nh->nh_oif = cfg->fc_oif; - nh->nh_gw = cfg->fc_gw; - nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_IP_ROUTE_CLASSID - nh->nh_tclassid = cfg->fc_flow; - if (nh->nh_tclassid) - fi->fib_net->ipv4.fib_num_tclassid_users++; -#endif -#ifdef CONFIG_IP_ROUTE_MULTIPATH - nh->nh_weight = 1; -#endif - } + if (err != 0) + goto failure; if (fib_props[cfg->fc_type].error) { - if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) { + if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; @@ -1195,15 +1363,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg, "Route with host scope can not have multiple nexthops"); goto err_inval; } - if (nh->nh_gw) { + if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; } - nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); + nh->fib_nh_scope = RT_SCOPE_NOWHERE; + nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif); err = -ENODEV; - if (!nh->nh_dev) + if (!nh->fib_nh_dev) goto failure; } else { int linkdown = 0; @@ -1212,7 +1380,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; - if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) + if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) linkdown++; } endfor_nexthops(fi) if (linkdown == fi->fib_nhs) @@ -1226,6 +1394,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); + if (nexthop_nh->fib_nh_gw_family == AF_INET6) + fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); @@ -1254,9 +1424,9 @@ link_it: struct hlist_head *head; unsigned int hash; - if (!nexthop_nh->nh_dev) + if (!nexthop_nh->fib_nh_dev) continue; - hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex); head = &fib_info_devhash[hash]; hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) @@ -1275,6 +1445,141 @@ failure: return ERR_PTR(err); } +int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, + unsigned char *flags, bool skip_oif) +{ + if (nhc->nhc_flags & RTNH_F_DEAD) + *flags |= RTNH_F_DEAD; + + if (nhc->nhc_flags & RTNH_F_LINKDOWN) { + *flags |= RTNH_F_LINKDOWN; + + rcu_read_lock(); + switch (nhc->nhc_family) { + case AF_INET: + if (ip_ignore_linkdown(nhc->nhc_dev)) + *flags |= RTNH_F_DEAD; + break; + case AF_INET6: + if (ip6_ignore_linkdown(nhc->nhc_dev)) + *flags |= RTNH_F_DEAD; + break; + } + rcu_read_unlock(); + } + + switch (nhc->nhc_gw_family) { + case AF_INET: + if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) + goto nla_put_failure; + break; + case AF_INET6: + /* if gateway family does not match nexthop family + * gateway is encoded as RTA_VIA + */ + if (nhc->nhc_gw_family != nhc->nhc_family) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + 
memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); + } else if (nla_put_in6_addr(skb, RTA_GATEWAY, + &nhc->nhc_gw.ipv6) < 0) { + goto nla_put_failure; + } + break; + } + + *flags |= (nhc->nhc_flags & RTNH_F_ONLINK); + if (nhc->nhc_flags & RTNH_F_OFFLOAD) + *flags |= RTNH_F_OFFLOAD; + + if (!skip_oif && nhc->nhc_dev && + nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) + goto nla_put_failure; + + if (nhc->nhc_lwtstate && + lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, + RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(fib_nexthop_info); + +#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) +int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, + int nh_weight) +{ + const struct net_device *dev = nhc->nhc_dev; + struct rtnexthop *rtnh; + unsigned char flags = 0; + + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); + if (!rtnh) + goto nla_put_failure; + + rtnh->rtnh_hops = nh_weight - 1; + rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; + + if (fib_nexthop_info(skb, nhc, &flags, true) < 0) + goto nla_put_failure; + + rtnh->rtnh_flags = flags; + + /* length of rtnetlink header + attributes */ + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(fib_add_nexthop); +#endif + +#ifdef CONFIG_IP_ROUTE_MULTIPATH +static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) +{ + struct nlattr *mp; + + mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + for_nexthops(fi) { + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0) + goto nla_put_failure; +#ifdef CONFIG_IP_ROUTE_CLASSID + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) + goto nla_put_failure; +#endif + } endfor_nexthops(fi); + + nla_nest_end(skb, mp); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} +#else +static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) +{ + return 0; +} +#endif + int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) @@ -1315,80 +1620,23 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { - if (fi->fib_nh->nh_gw && - nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) - goto nla_put_failure; - if (fi->fib_nh->nh_oif && - nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) + struct fib_nh *nh = &fi->fib_nh[0]; + unsigned char flags = 0; + + if (fib_nexthop_info(skb, &nh->nh_common, &flags, false) < 0) goto nla_put_failure; - if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev); - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) - rtm->rtm_flags |= RTNH_F_DEAD; - rcu_read_unlock(); - } - if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD) - rtm->rtm_flags |= RTNH_F_OFFLOAD; + + rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID - if (fi->fib_nh[0].nh_tclassid && - nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) + if (nh->nh_tclassid && + nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif - if (fi->fib_nh->nh_lwtstate && - lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0) + } else { + if (fib_add_multipath(skb, fi) < 0) goto nla_put_failure; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - 
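
Per the comment in fib_nexthop_info() above, a gateway whose family differs from the route's is emitted as RTA_VIA instead of RTA_GATEWAY: the payload is a struct rtvia, a 2-byte family followed by the raw address, which is why alen + 2 bytes are reserved. A userspace decoder for that layout might look like this (sketch; the function name and error convention are invented):

    #include <stddef.h>
    #include <string.h>
    #include <netinet/in.h>         /* AF_INET6, struct in6_addr */
    #include <linux/rtnetlink.h>    /* struct rtvia */

    /* Decode an RTA_VIA payload carrying an IPv6 gateway. */
    static int decode_via6(const void *payload, int len, struct in6_addr *gw6)
    {
            const struct rtvia *via = payload;

            if (len != (int)(offsetof(struct rtvia, rtvia_addr) + sizeof(*gw6)))
                    return -1;
            if (via->rtvia_family != AF_INET6)
                    return -1;
            memcpy(gw6, via->rtvia_addr, sizeof(*gw6));
            return 0;
    }
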
if (fi->fib_nhs > 1) { - struct rtnexthop *rtnh; - struct nlattr *mp; - mp = nla_nest_start(skb, RTA_MULTIPATH); - if (!mp) - goto nla_put_failure; - - for_nexthops(fi) { - rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); - if (!rtnh) - goto nla_put_failure; - - rtnh->rtnh_flags = nh->nh_flags & 0xFF; - if (nh->nh_flags & RTNH_F_LINKDOWN) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(nh->nh_dev); - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) - rtnh->rtnh_flags |= RTNH_F_DEAD; - rcu_read_unlock(); - } - rtnh->rtnh_hops = nh->nh_weight - 1; - rtnh->rtnh_ifindex = nh->nh_oif; - - if (nh->nh_gw && - nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw)) - goto nla_put_failure; -#ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; -#endif - if (nh->nh_lwtstate && - lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0) - goto nla_put_failure; - - /* length of rtnetlink header + attributes */ - rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; - } endfor_nexthops(fi); - - nla_nest_end(skb, mp); - } -#endif nlmsg_end(skb, nlh); return 0; @@ -1427,28 +1675,26 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) return ret; } -static int call_fib_nh_notifiers(struct fib_nh *fib_nh, +static int call_fib_nh_notifiers(struct fib_nh *nh, enum fib_event_type event_type) { - struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev); + bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); struct fib_nh_notifier_info info = { - .fib_nh = fib_nh, + .fib_nh = nh, }; switch (event_type) { case FIB_EVENT_NH_ADD: - if (fib_nh->nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) break; - if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - fib_nh->nh_flags & RTNH_F_LINKDOWN) + if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) break; - return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, + return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: - if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - fib_nh->nh_flags & RTNH_F_LINKDOWN) || - (fib_nh->nh_flags & RTNH_F_DEAD)) - return call_fib4_notifiers(dev_net(fib_nh->nh_dev), + if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || + (nh->fib_nh_flags & RTNH_F_DEAD)) + return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); default: break; @@ -1467,12 +1713,12 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ -static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig) +static void nh_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; - bucket = rcu_dereference_protected(nh->nh_exceptions, 1); + bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!bucket) return; @@ -1502,8 +1748,8 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->nh_dev == dev) - nh_update_mtu(nh, dev->mtu, orig_mtu); + if (nh->fib_nh_dev == dev) + nh_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1530,22 +1776,22 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) int dead; BUG_ON(!fi->fib_nhs); - if (nh->nh_dev != dev || fi == prev_fi) + if (nh->fib_nh_dev != dev || fi == prev_fi) continue; prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nexthop_nh->nh_flags & 
RTNH_F_DEAD) + if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) dead++; - else if (nexthop_nh->nh_dev == dev && - nexthop_nh->nh_scope != scope) { + else if (nexthop_nh->fib_nh_dev == dev && + nexthop_nh->fib_nh_scope != scope) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: - nexthop_nh->nh_flags |= RTNH_F_DEAD; + nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; /* fall through */ case NETDEV_CHANGE: - nexthop_nh->nh_flags |= RTNH_F_LINKDOWN; + nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; break; } call_fib_nh_notifiers(nexthop_nh, @@ -1554,7 +1800,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (event == NETDEV_UNREGISTER && - nexthop_nh->nh_dev == dev) { + nexthop_nh->fib_nh_dev == dev) { dead = fi->fib_nhs; break; } @@ -1614,8 +1860,8 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + if (!next_fi->fib_nh[0].fib_nh_gw4 || + next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); @@ -1658,7 +1904,7 @@ out: * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. */ -int fib_sync_up(struct net_device *dev, unsigned int nh_flags) +int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; unsigned int hash; @@ -1686,24 +1932,24 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) int alive; BUG_ON(!fi->fib_nhs); - if (nh->nh_dev != dev || fi == prev_fi) + if (nh->fib_nh_dev != dev || fi == prev_fi) continue; prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & nh_flags)) { + if (!(nexthop_nh->fib_nh_flags & nh_flags)) { alive++; continue; } - if (!nexthop_nh->nh_dev || - !(nexthop_nh->nh_dev->flags & IFF_UP)) + if (!nexthop_nh->fib_nh_dev || + !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) continue; - if (nexthop_nh->nh_dev != dev || + if (nexthop_nh->fib_nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; - nexthop_nh->nh_flags &= ~nh_flags; + nexthop_nh->fib_nh_flags &= ~nh_flags; call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) @@ -1723,13 +1969,19 @@ static bool fib_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; - if (nh->nh_scope == RT_SCOPE_LINK) { + if (nh->fib_nh_scope == RT_SCOPE_LINK) { struct neighbour *n; rcu_read_lock_bh(); - n = __ipv4_neigh_lookup_noref(nh->nh_dev, - (__force u32)nh->nh_gw); + if (likely(nh->fib_nh_gw_family == AF_INET)) + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + else if (nh->fib_nh_gw_family == AF_INET6) + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, + &nh->fib_nh_gw6); + else + n = NULL; if (n) state = n->nud_state; @@ -1745,20 +1997,22 @@ void fib_select_multipath(struct fib_result *res, int hash) struct net *net = fi->fib_net; bool first = false; - for_nexthops(fi) { + change_nexthops(fi) { if (net->ipv4.sysctl_fib_multipath_use_neigh) { - if (!fib_good_nh(nh)) + if (!fib_good_nh(nexthop_nh)) continue; if (!first) { res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; first = true; } } - if (hash > atomic_read(&nh->nh_upper_bound)) + if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; return; } endfor_nexthops(fi); } @@ -1785,5 +2039,5 @@ void fib_select_path(struct net *net, struct 
fib_result *res, check_saddr: if (!fl4->saddr) - fl4->saddr = FIB_RES_PREFSRC(net, *res); + fl4->saddr = fib_result_prefsrc(net, res); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a573e37e0615..334f723bdf80 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -183,14 +183,16 @@ struct trie { }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); -static size_t tnode_free_size; +static unsigned int tnode_free_size; /* - * synchronize_rcu after call_rcu for that many pages; it should be especially - * useful before resizing the root node with PREEMPT_NONE configs; the value was - * obtained experimentally, aiming to avoid visible slowdown. + * synchronize_rcu after call_rcu for outstanding dirty memory; it should be + * especially useful before resizing the root node with PREEMPT_NONE configs; + * the value was obtained experimentally, aiming to avoid visible slowdown. */ -static const int sync_pages = 128; +unsigned int sysctl_fib_sync_mem = 512 * 1024; +unsigned int sysctl_fib_sync_mem_min = 64 * 1024; +unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; @@ -504,7 +506,7 @@ static void tnode_free(struct key_vector *tn) tn = container_of(head, struct tnode, rcu)->kv; } - if (tnode_free_size >= PAGE_SIZE * sync_pages) { + if (tnode_free_size >= sysctl_fib_sync_mem) { tnode_free_size = 0; synchronize_rcu(); } @@ -1468,19 +1470,17 @@ found: if (fi->fib_flags & RTNH_F_DEAD) continue; for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); + struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); - if (nh->nh_flags & RTNH_F_DEAD) + if (nhc->nhc_flags & RTNH_F_DEAD) continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nh->nh_flags & RTNH_F_LINKDOWN && + if (ip_ignore_linkdown(nhc->nhc_dev) && + nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && - flp->flowi4_oif != nh->nh_oif) + flp->flowi4_oif != nhc->nhc_oif) continue; } @@ -1490,6 +1490,7 @@ found: res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; + res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->fi = fi; @@ -1498,7 +1499,7 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif - trace_fib_table_lookup(tb->tb_id, flp, nh, err); + trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } @@ -2651,7 +2652,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->nh_gw) + if (fi && fi->fib_nh->fib_nh_gw4) flags |= RTF_GATEWAY; if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; @@ -2702,7 +2703,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) "%d\t%08X\t%d\t%u\t%u", fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_nh->fib_nh_gw4, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? 
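
The fib_trie hunk above turns the fixed sync_pages threshold into the tunable sysctl_fib_sync_mem with explicit bounds. The table entry that exposes it is not part of this hunk; an entry of the usual shape, assuming the standard clamped unsigned-int handler, would be:

    static struct ctl_table demo_table[] = {
            {
                    .procname       = "fib_sync_mem",
                    .data           = &sysctl_fib_sync_mem,
                    .maxlen         = sizeof(sysctl_fib_sync_mem),
                    .mode           = 0644,
                    .proc_handler   = proc_douintvec_minmax,
                    .extra1         = &sysctl_fib_sync_mem_min,
                    .extra2         = &sysctl_fib_sync_mem_max,
            },
            { }
    };
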
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 79e98e21cdd7..ca95051317ed 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -121,6 +121,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) struct guehdr *guehdr; void *data; u16 doffset = 0; + u8 proto_ctype; if (!fou) return 1; @@ -136,7 +137,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ int prot; @@ -170,9 +171,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) /* guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - hdrlen = sizeof(struct guehdr) + optlen; - - if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) + if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; @@ -212,13 +211,14 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); + proto_ctype = guehdr->proto_ctype; __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); if (iptunnel_pull_offloads(skb)) goto drop; - return -guehdr->proto_ctype; + return -proto_ctype; drop: kfree_skb(skb); @@ -499,15 +499,45 @@ out_unlock: return err; } -static int fou_add_to_port_list(struct net *net, struct fou *fou) +static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg) +{ + struct sock *sk = fou->sock->sk; + struct udp_port_cfg *udp_cfg = &cfg->udp_config; + + if (fou->family != udp_cfg->family || + fou->port != udp_cfg->local_udp_port || + sk->sk_dport != udp_cfg->peer_udp_port || + sk->sk_bound_dev_if != udp_cfg->bind_ifindex) + return false; + + if (fou->family == AF_INET) { + if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr || + sk->sk_daddr != udp_cfg->peer_ip.s_addr) + return false; + else + return true; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) || + ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6)) + return false; + else + return true; +#endif + } + + return false; +} + +static int fou_add_to_port_list(struct net *net, struct fou *fou, + struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (fou->port == fout->port && - fou->family == fout->family) { + if (fou_cfg_cmp(fout, cfg)) { mutex_unlock(&fn->fou_lock); return -EALREADY; } @@ -585,7 +615,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - err = fou_add_to_port_list(net, fou); + err = fou_add_to_port_list(net, fou, cfg); if (err) goto error; @@ -605,14 +635,12 @@ error: static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); - __be16 port = cfg->udp_config.local_udp_port; - u8 family = cfg->udp_config.family; int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { - if (fou->port == port && fou->family == family) { + if (fou_cfg_cmp(fou, cfg)) { fou_release(fou); err = 0; break; @@ -626,16 +654,27 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) static struct genl_family fou_nl_family; static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { - [FOU_ATTR_PORT] = { .type = NLA_U16, }, - [FOU_ATTR_AF] = { .type = NLA_U8, }, - [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, - [FOU_ATTR_TYPE] = { .type = NLA_U8, }, - 
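
Note why gue_udp_recv() above copies proto_ctype into a local before __skb_pull(): guehdr points into the packet buffer, and once data has been pulled the old header pointer must not be dereferenced. The same rule in a generic, self-contained form:

    #include <stddef.h>

    struct hdr { unsigned char version, proto; };
    struct pkt { unsigned char *data; size_t len; };

    static int deliver(struct pkt *p)
    {
            const struct hdr *h = (const struct hdr *)p->data;
            unsigned char proto = h->proto; /* copy out before the pull */

            p->data += sizeof(*h);          /* h is stale from here on */
            p->len -= sizeof(*h);
            return -proto;
    }
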
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, + [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, + [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, + [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, + [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), }, + [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { + bool has_local = false, has_peer = false; + struct nlattr *attr; + int ifindex; + __be16 port; + memset(cfg, 0, sizeof(*cfg)); cfg->udp_config.family = AF_INET; @@ -657,8 +696,7 @@ static int parse_nl_config(struct genl_info *info, } if (info->attrs[FOU_ATTR_PORT]) { - __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); - + port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } @@ -671,6 +709,52 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; + if (cfg->udp_config.family == AF_INET) { + if (info->attrs[FOU_ATTR_LOCAL_V4]) { + attr = info->attrs[FOU_ATTR_LOCAL_V4]; + cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr); + has_local = true; + } + + if (info->attrs[FOU_ATTR_PEER_V4]) { + attr = info->attrs[FOU_ATTR_PEER_V4]; + cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr); + has_peer = true; + } +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (info->attrs[FOU_ATTR_LOCAL_V6]) { + attr = info->attrs[FOU_ATTR_LOCAL_V6]; + cfg->udp_config.local_ip6 = nla_get_in6_addr(attr); + has_local = true; + } + + if (info->attrs[FOU_ATTR_PEER_V6]) { + attr = info->attrs[FOU_ATTR_PEER_V6]; + cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr); + has_peer = true; + } +#endif + } + + if (has_peer) { + if (info->attrs[FOU_ATTR_PEER_PORT]) { + port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]); + cfg->udp_config.peer_udp_port = port; + } else { + return -EINVAL; + } + } + + if (info->attrs[FOU_ATTR_IFINDEX]) { + if (!has_local) + return -EINVAL; + + ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]); + + cfg->udp_config.bind_ifindex = ifindex; + } + return 0; } @@ -702,15 +786,37 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) static int fou_fill_info(struct fou *fou, struct sk_buff *msg) { + struct sock *sk = fou->sock->sk; + if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || + nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) || nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || - nla_put_u8(msg, FOU_ATTR_TYPE, fou->type)) + nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) || + nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if)) return -1; if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) return -1; + + if (fou->sock->sk->sk_family == AF_INET) { + if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr)) + return -1; + + if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr)) + return -1; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6, + &sk->sk_v6_rcv_saddr)) + return -1; + + if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr)) + return -1; +#endif + } + return 0; } @@ -763,7 +869,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, 
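
parse_nl_config() above enforces two dependencies between the new attributes: a peer address is only accepted together with FOU_ATTR_PEER_PORT, and FOU_ATTR_IFINDEX is only accepted when a local address is present. The same checks, isolated with illustrative types:

    #include <errno.h>
    #include <stdbool.h>

    struct fou_req {
            bool has_local, has_peer, has_peer_port, has_ifindex;
    };

    static int fou_req_valid(const struct fou_req *r)
    {
            if (r->has_peer && !r->has_peer_port)
                    return -EINVAL; /* a connected socket needs a dport */
            if (r->has_ifindex && !r->has_local)
                    return -EINVAL; /* device bind without a local address */
            return 0;
    }
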
struct genl_info *info) ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (port == fout->port && family == fout->family) { + if (fou_cfg_cmp(fout, &cfg)) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); @@ -807,21 +913,21 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) static const struct genl_ops fou_nl_ops[] = { { .cmd = FOU_CMD_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = fou_nl_cmd_add_port, - .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = FOU_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = fou_nl_cmd_rm_port, - .policy = fou_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = FOU_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = fou_nl_cmd_get_port, .dumpit = fou_nl_dump, - .policy = fou_nl_policy, }, }; @@ -830,6 +936,7 @@ static struct genl_family fou_nl_family __ro_after_init = { .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, + .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = fou_nl_ops, @@ -1033,7 +1140,7 @@ static int gue_err(struct sk_buff *skb, u32 info) case 0: /* Full GUE header present */ break; case 1: { - /* Direct encasulation of IPv4 or IPv6 */ + /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6ea523d71947..a175e3e7ae97 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -564,7 +564,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_gw_family) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 00ec819f949b..06f6f280b9ff 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_uses_gateway) + if (opt->is_strictroute && rt->rt_gw_family) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fd219f7bd3ea..4b0526441476 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -282,9 +281,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - pkt_md = (struct erspan_metadata *)(ershdr + 1); - if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), @@ -292,8 +288,9 @@ static int erspan_rcv(struct sk_buff 
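
In the collect_md branch of erspan_rcv() just below, pkt_md is re-derived from skb_network_header() after __iptunnel_pull_header(), because that call may unclone the skb and relocate its data. The bug shape in a small userspace analogue (realloc standing in for the unclone):

    #include <stdlib.h>

    struct buf { unsigned char *data; size_t len; };

    /* May move b->data, the way __iptunnel_pull_header() may. */
    static int buf_unclone(struct buf *b)
    {
            unsigned char *p = realloc(b->data, b->len);

            if (!p)
                    return -1;
            b->data = p;
            return 0;
    }

    static const unsigned char *locate_md(struct buf *b, size_t md_off)
    {
            const unsigned char *md = b->data + md_off; /* pre-call pointer */

            if (buf_unclone(b))
                    return NULL;
            md = b->data + md_off;  /* re-derive: the old pointer may be stale */
            return md;
    }
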
*skb, struct tnl_ptk_info *tpi, goto drop; if (tunnel->collect_md) { + struct erspan_metadata *pkt_md, *md; struct ip_tunnel_info *info; - struct erspan_metadata *md; + unsigned char *gh; __be64 tun_id; __be16 flags; @@ -306,6 +303,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (!tun_dst) return PACKET_REJECT; + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); md = ip_tunnel_info_opts(&tun_dst->u.tun_info); md->version = ver; md2 = &md->u.md2; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index ecce2dc78f17..ed97724c5e33 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -130,6 +130,7 @@ #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> @@ -188,6 +189,8 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } +INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; @@ -205,7 +208,8 @@ resubmit: } nf_reset(skb); } - ret = ipprot->handler(skb); + ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, + skb); if (ret < 0) { protocol = -ret; goto resubmit; @@ -257,11 +261,10 @@ int ip_local_deliver(struct sk_buff *skb) ip_local_deliver_finish); } -static inline bool ip_rcv_options(struct sk_buff *skb) +static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; - struct net_device *dev = skb->dev; /* It looks as overkill, because not all IP options require packet mangling. 
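
The INDIRECT_CALL_2() wrappers above exist for retpoline builds: the handler pointer is compared against its most likely targets so the common case becomes a direct, branch-predictable call instead of a retpolined indirect one. The expansion is essentially this (simplified from include/linux/indirect_call_wrapper.h):

    #define INDIRECT_CALL_1(f, f1, ...)                                 \
            ((f) == (f1) ? f1(__VA_ARGS__) : (f)(__VA_ARGS__))
    #define INDIRECT_CALL_2(f, f2, f1, ...)                             \
            ((f) == (f2) ? f2(__VA_ARGS__) :                            \
                           INDIRECT_CALL_1(f, f1, __VA_ARGS__))
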
@@ -297,7 +300,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb) } } - if (ip_options_rcv_srr(skb)) + if (ip_options_rcv_srr(skb, dev)) goto drop; } @@ -306,6 +309,8 @@ drop: return true; } +INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev) { @@ -323,7 +328,8 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - err = edemux(skb); + err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux, + udp_v4_early_demux, skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ @@ -353,7 +359,7 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, } #endif - if (iph->ihl > 5 && ip_rcv_options(skb)) + if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 32a35043c9f5..3db31bb9df50 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -612,7 +612,7 @@ void ip_forward_options(struct sk_buff *skb) } } -int ip_options_rcv_srr(struct sk_buff *skb) +int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; @@ -647,7 +647,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c80188875f39..ac880beda8a7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -188,7 +188,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; - u32 nexthop; + bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); @@ -218,16 +218,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } rcu_read_lock_bh(); - nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); - res = neigh_output(neigh, skb); - + /* if crossing protocols, can not use the cached header */ + res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock_bh(); return res; } @@ -472,7 +469,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) goto no_route; /* OK, we know where to send it, allocate and build IP header. 
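
ip_finish_output2() above threads is_v6gw through to neigh_output() because an IPv4 route may now resolve to an IPv6 neighbour, and the cached hardware header was built for the neighbour's own protocol. The helper after this change looks essentially like this (condensed from include/net/neighbour.h):

    static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
                                   bool skip_cache)
    {
            const struct hh_cache *hh = &n->hh;

            /* The cached L2 header is only valid within one family. */
            if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
                    return neigh_hh_output(hh, skb);

            return n->output(n, skb);
    }
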
*/ @@ -519,6 +516,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; + to->skb_iif = from->skb_iif; skb_dst_drop(to); skb_dst_copy(to, from); to->dev = from->dev; @@ -693,11 +691,8 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - while (frag) { - skb = frag->next; - kfree_skb(frag); - frag = skb; - } + kfree_skb_list(frag); + IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c3f3d28d1087..30c1c264bdfc 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -239,8 +239,8 @@ static int ip_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, - extack); + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr, + ip_tun_policy, extack); if (err < 0) return err; @@ -356,8 +356,8 @@ static int ip6_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, - extack); + err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr, + ip6_tun_policy, extack); if (err < 0) return err; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 68a21bf75dd0..254a42e83ff9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -50,7 +50,7 @@ static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) + int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); @@ -65,6 +65,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; + if (update_skb_dev) + skb->dev = tunnel->dev; + return xfrm_input(skb, nexthdr, spi, encap_type); } @@ -74,47 +77,28 @@ drop: return 0; } -static int vti_input_ipip(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type) +static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { - struct ip_tunnel *tunnel; - const struct iphdr *iph = ip_hdr(skb); - struct net *net = dev_net(skb->dev); - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); - if (tunnel) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; - - skb->dev = tunnel->dev; - - return xfrm_input(skb, nexthdr, spi, encap_type); - } - - return -EINVAL; -drop: - kfree_skb(skb); - return 0; + return vti_input(skb, nexthdr, spi, encap_type, false); } -static int vti_rcv(struct sk_buff *skb) +static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - return vti_input(skb, ip_hdr(skb)->protocol, 0, 0); + return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); } -static int vti_rcv_ipip(struct sk_buff *skb) +static int vti_rcv_proto(struct sk_buff *skb) { - XFRM_SPI_SKB_CB(skb)->family = AF_INET; - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + return vti_rcv(skb, 0, false); +} - return vti_input_ipip(skb, ip_hdr(skb)->protocol, ip_hdr(skb)->saddr, 0); +static int vti_rcv_tunnel(struct sk_buff 
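
The ip_do_fragment() hunk above replaces a hand-rolled free loop with kfree_skb_list(), which walks skb->next itself; the two are equivalent, the helper being roughly:

    /* What kfree_skb_list(frag) does, spelled out: */
    while (frag) {
            struct sk_buff *next = frag->next;

            kfree_skb(frag);
            frag = next;
    }
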
*skb) +{ + return vti_rcv(skb, ip_hdr(skb)->saddr, true); } static int vti_rcv_cb(struct sk_buff *skb, int err) @@ -123,7 +107,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; - struct xfrm_mode *inner_mode; + const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; @@ -142,7 +126,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); - inner_mode = x->inner_mode; + inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); @@ -153,7 +137,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) } } - family = inner_mode->afinfo->family; + family = inner_mode->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); @@ -447,31 +431,31 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { - .handler = vti_rcv, - .input_handler = vti_input, + .handler = vti_rcv_proto, + .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm_tunnel ipip_handler __read_mostly = { - .handler = vti_rcv_ipip, + .handler = vti_rcv_tunnel, .err_handler = vti4_err, .priority = 0, }; @@ -646,10 +630,8 @@ static int __init vti_init(void) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&ipip_handler, AF_INET); - if (err < 0) { - pr_info("%s: cant't register tunnel\n",__func__); + if (err < 0) goto xfrm_tunnel_failed; - } msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); @@ -659,9 +641,9 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); -xfrm_tunnel_failed: xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +xfrm_tunnel_failed: + xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: @@ -676,6 +658,7 @@ pernet_dev_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); + xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2c931120c494..2c61e10a60e3 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -66,7 +66,7 @@ #include <net/netlink.h> #include <net/fib_rules.h> #include <linux/netconf.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <linux/nospec.h> @@ -373,7 +373,6 @@ static const struct rhashtable_params ipmr_rht_params = { .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, - .locks_mul = 1, .obj_cmpfn = ipmr_hash_cmp, 
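
The vti_init() error-path reorder above restores the usual rule that unwind labels undo registrations in exactly the reverse order they were made; with the two labels swapped, a later failure would have deregistered the wrong object. Skeleton of the idiom:

    static int register_a(void) { return 0; }
    static int register_b(void) { return 0; }
    static int register_c(void) { return 0; }
    static void unregister_a(void) { }
    static void unregister_b(void) { }

    static int demo_init(void)
    {
            int err;

            err = register_a();
            if (err < 0)
                    goto out;
            err = register_b();
            if (err < 0)
                    goto unreg_a;
            err = register_c();
            if (err < 0)
                    goto unreg_b;
            return 0;

    unreg_b:        /* c failed: undo b, then a, never the reverse */
            unregister_b();
    unreg_a:
            unregister_a();
    out:
            return err;
    }
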
.automatic_shrinking = true, }; @@ -2499,8 +2498,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || @@ -2511,8 +2510,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); if (err) return err; @@ -2675,8 +2674,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct rtmsg *rtm; int ret, rem; - ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, - extack); + ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, + rtm_ipmr_policy, extack); if (ret < 0) goto out; rtm = nlmsg_data(nlh); @@ -2784,7 +2783,7 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) return true; vif = &mrt->vif_table[vifid]; - vif_nest = nla_nest_start(skb, IPMRA_VIF); + vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF); if (!vif_nest) return false; if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) || @@ -2868,7 +2867,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) memset(hdr, 0, sizeof(*hdr)); hdr->ifi_family = RTNL_FAMILY_IPMR; - af = nla_nest_start(skb, IFLA_AF_SPEC); + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af) { nlmsg_cancel(skb, nlh); goto out; @@ -2879,7 +2878,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) goto out; } - vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS); + vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS); if (!vifs) { nla_nest_end(skb, af); nlmsg_end(skb, nlh); diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 3e614cc824f7..ea48bd15a575 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -228,7 +228,7 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (c->mfc_flags & MFC_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; - mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp_attr) return -EMSGSIZE; @@ -335,8 +335,6 @@ next_entry2: } spin_unlock_bh(lock); err = 0; - e = 0; - out: cb->args[1] = e; return err; @@ -374,6 +372,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, err = mr_table_dump(mrt, skb, cb, fill, lock, filter); if (err < 0) break; + cb->args[1] = 0; next_table: t++; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c98391d49200..1412b029f37f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -27,14 +27,6 @@ config NF_TABLES_IPV4 if NF_TABLES_IPV4 -config NFT_CHAIN_ROUTE_IPV4 - tristate "IPv4 nf_tables route chain support" - help - This option enables the "route" chain for IPv4 in nf_tables. This - chain type is used to force packet re-routing after mangling header - fields such as the source, destination, type of service and - the packet mark. 
- config NFT_REJECT_IPV4 select NF_REJECT_IPV4 default NFT_REJECT @@ -232,16 +224,10 @@ if IP_NF_NAT config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - select NF_NAT_MASQUERADE - default m if NETFILTER_ADVANCED=n + select NETFILTER_XT_TARGET_MASQUERADE help - Masquerading is a special case of NAT: all outgoing connections are - changed to seem to come from a particular interface's address, and - if the interface goes down, those connections are lost. This is - only useful for dialup accounts with dynamic IP address (ie. your IP - address will be different on next dialup). - - To compile it as a module, choose M here. If unsure, say N. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP_NF_TARGET_NETMAP tristate "NETMAP target support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index e241f5188ebe..c50e0ec095d2 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -24,7 +24,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o @@ -49,7 +48,6 @@ obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o # targets obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o -obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 835d50b279f5..a2a88ab07f7b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -56,7 +56,7 @@ struct clusterip_config { #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ - struct rcu_head rcu; /* for call_rcu_bh */ + struct rcu_head rcu; /* for call_rcu */ struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ }; diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 4e6b53ab6c33..7875c98072eb 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -631,4 +631,4 @@ module_exit(fini); MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); MODULE_DESCRIPTION("H.323 NAT helper"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_h323"); +MODULE_ALIAS_NF_NAT_HELPER("h323"); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 68b4d450391b..e17b4ee7604c 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -37,7 +37,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); -MODULE_ALIAS("ip_nat_pptp"); +MODULE_ALIAS_NF_NAT_HELPER("pptp"); static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c deleted file mode 100644 index 7d82934c46f4..000000000000 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2008 Patrick 
McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv4.h> -#include <net/route.h> -#include <net/ip.h> - -static unsigned int nf_route_table_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - unsigned int ret; - struct nft_pktinfo pkt; - u32 mark; - __be32 saddr, daddr; - u_int8_t tos; - const struct iphdr *iph; - int err; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - mark = skb->mark; - iph = ip_hdr(skb); - saddr = iph->saddr; - daddr = iph->daddr; - tos = iph->tos; - - ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_STOLEN) { - iph = ip_hdr(skb); - - if (iph->saddr != saddr || - iph->daddr != daddr || - skb->mark != mark || - iph->tos != tos) { - err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); - if (err < 0) - ret = NF_DROP_ERR(err); - } - } - return ret; -} - -static const struct nft_chain_type nft_chain_route_ipv4 = { - .name = "route", - .type = NFT_CHAIN_T_ROUTE, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_OUT), - .hooks = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, -}; - -static int __init nft_chain_route_init(void) -{ - nft_register_chain_type(&nft_chain_route_ipv4); - - return 0; -} - -static void __exit nft_chain_route_exit(void) -{ - nft_unregister_chain_type(&nft_chain_route_ipv4); -} - -module_init(nft_chain_route_init); -module_exit(nft_chain_route_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "route"); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index c55a5432cf37..dc91c27bb788 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -173,6 +173,7 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { int sdif = inet_sdif(skb); + int dif = inet_iif(skb); struct sock *sk; struct hlist_head *head; int delivered = 0; @@ -185,8 +186,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, - iph->saddr, iph->daddr, - skb->dev->ifindex, sdif); + iph->saddr, iph->daddr, dif, sdif); while (sk) { delivered = 1; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a5da63e5faa2..11ddc276776e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -434,37 +434,46 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; - const __be32 *pkey = daddr; - const struct rtable *rt; struct neighbour *n; - rt = (const struct rtable *) dst; - if (rt->rt_gateway) - pkey = (const __be32 *) &rt->rt_gateway; - else if (skb) - pkey = &ip_hdr(skb)->daddr; + rcu_read_lock_bh(); + + if (likely(rt->rt_gw_family == AF_INET)) { + n = ip_neigh_gw4(dev, 
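
The rewritten ipv4_neigh_lookup(), beginning above and continuing below, performs the whole lookup under rcu_read_lock_bh() and only returns a neighbour once refcount_inc_not_zero() succeeds: an object found under RCU may already be mid-teardown, and a zero refcount must read as a miss. The pattern in generic form (rcu_table_find() is a hypothetical RCU-safe search):

    struct obj {
            refcount_t refcnt;
            /* ... */
    };

    static struct obj *lookup_and_hold(u32 key)
    {
            struct obj *o;

            rcu_read_lock();
            o = rcu_table_find(key);        /* hypothetical RCU hash walk */
            if (o && !refcount_inc_not_zero(&o->refcnt))
                    o = NULL;               /* raced with the final put */
            rcu_read_unlock();

            return o;       /* caller holds a reference, or NULL */
    }
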
rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + n = ip_neigh_gw6(dev, &rt->rt_gw6); + } else { + __be32 pkey; + + pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); + n = ip_neigh_gw4(dev, pkey); + } + + if (n && !refcount_inc_not_zero(&n->refcnt)) + n = NULL; - n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); - if (n) - return n; - return neigh_create(&arp_tbl, pkey, dev); + rcu_read_unlock_bh(); + + return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - const struct rtable *rt; - rt = (const struct rtable *)dst; - if (rt->rt_gateway) - pkey = (const __be32 *)&rt->rt_gateway; - else if (!daddr || + if (rt->rt_gw_family == AF_INET) { + pkey = (const __be32 *)&rt->rt_gw4; + } else if (rt->rt_gw_family == AF_INET6) { + return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); + } else if (!daddr || (rt->rt_flags & - (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; - + } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } @@ -500,15 +509,17 @@ EXPORT_SYMBOL(ip_idents_reserve); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { - static u32 ip_idents_hashrnd __read_mostly; u32 hash, id; - net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); + /* Note the following code is not safe, but this is okay. */ + if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) + get_random_bytes(&net->ipv4.ip_id_key, + sizeof(net->ipv4.ip_id_key)); - hash = jhash_3words((__force u32)iph->daddr, + hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, - iph->protocol ^ net_hash_mix(net), - ip_idents_hashrnd); + iph->protocol, + &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } @@ -627,13 +638,14 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = fnhe->fnhe_gw; - rt->rt_uses_gateway = 1; + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = fnhe->fnhe_gw; } } -static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, - u32 pmtu, bool lock, unsigned long expires) +static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, + __be32 gw, u32 pmtu, bool lock, + unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; @@ -642,17 +654,17 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, unsigned int i; int depth; - genid = fnhe_genid(dev_net(nh->nh_dev)); + genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); - hash = rcu_dereference(nh->nh_exceptions); + hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; - rcu_assign_pointer(nh->nh_exceptions, hash); + rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; @@ -704,13 +716,13 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, * stale, so anyone caching it rechecks if this exception * applies to them. 
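
__ip_select_ident() above swaps a global 32-bit jhash seed for a per-namespace 128-bit SipHash key, so IP IDs can no longer be correlated across namespaces and the keyed hash is far harder to recover from observed IDs. The "not safe, but this is okay" comment refers to the lazy key setup: concurrent first callers may each write the key, which at worst perturbs a handful of early IDs. Condensed to its core:

    static u32 ident_hash(struct net *net, __be32 daddr, __be32 saddr, u8 proto)
    {
            /* Tolerated race: two CPUs may both initialize the key once. */
            if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                    get_random_bytes(&net->ipv4.ip_id_key,
                                     sizeof(net->ipv4.ip_id_key));

            return siphash_3u32((__force u32)daddr, (__force u32)saddr,
                                proto, &net->ipv4.ip_id_key);
    }
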
*/ - rt = rcu_dereference(nh->nh_rth_input); + rt = rcu_dereference(nhc->nhc_rth_input); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; for_each_possible_cpu(i) { struct rtable __rcu **prt; - prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -745,7 +757,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow return; } - if (rt->rt_gateway != old_gw) + if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); @@ -776,9 +788,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_common *nhc = FIB_RES_NHC(res); - update_or_create_fnhe(nh, fl4->daddr, new_gw, + update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } @@ -1025,9 +1037,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) rcu_read_lock(); if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh_common *nhc = FIB_RES_NHC(res); - update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); @@ -1176,18 +1188,46 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or - * DST_OBSOLETE_DEAD by dst_free(). + * DST_OBSOLETE_DEAD. */ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } +static void ipv4_send_dest_unreach(struct sk_buff *skb) +{ + struct ip_options opt; + int res; + + /* Recompile ip options since IPCB may not be valid anymore. + * Also check we have a reasonable ipv4 header. 
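+ *
+ * (Editor's note, not part of this change: icmp_send() takes the
+ * options to echo from IPCB(skb), but by the time of a link failure
+ * that control block may have been reused as scratch space by other
+ * layers. Re-parsing the options from the packet itself and handing
+ * them to __icmp_send() explicitly means a stale or corrupted IPCB
+ * is never trusted here.)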
+ */ + if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || + ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) + return; + + memset(&opt, 0, sizeof(opt)); + if (ip_hdr(skb)->ihl > 5) { + if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) + return; + opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + rcu_read_unlock(); + + if (res) + return; + } + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); +} + static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) @@ -1233,7 +1273,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) - src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), @@ -1276,7 +1316,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) + if (rt->rt_gw_family && mtu > 576) mtu = 576; } @@ -1285,7 +1325,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } -static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; @@ -1293,7 +1333,7 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_lock_bh(&fnhe_lock); - hash = rcu_dereference_protected(nh->nh_exceptions, + hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; @@ -1319,9 +1359,10 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } -static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, + __be32 daddr) { - struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); + struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; @@ -1335,7 +1376,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { - ip_del_fnhe(nh, daddr); + ip_del_fnhe(nhc, daddr); break; } return fnhe; @@ -1352,9 +1393,9 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { + struct fib_nh_common *nhc = res->nhc; + struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; - struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; - struct net_device *dev = nh->nh_dev; u32 mtu = 0; if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || @@ -1364,7 +1405,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) if (likely(!mtu)) { struct fib_nh_exception *fnhe; - fnhe = find_exception(nh, daddr); + fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } @@ -1372,7 +1413,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); - return mtu - 
lwtunnel_headroom(nh->nh_lwtstate, mtu); + return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, @@ -1403,8 +1444,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, orig = NULL; } fill_route_from_fnhe(rt, fnhe); - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw4 = daddr; + rt->rt_gw_family = AF_INET; + } if (do_cache) { dst_hold(&rt->dst); @@ -1423,15 +1466,15 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, return ret; } -static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { - p = (struct rtable **)&nh->nh_rth_input; + p = (struct rtable **)&nhc->nhc_rth_input; } else { - p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output); + p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; @@ -1527,30 +1570,42 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, bool cached = false; if (fi) { - struct fib_nh *nh = &FIB_RES_NH(*res); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); - if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { - rt->rt_gateway = nh->nh_gw; - rt->rt_uses_gateway = 1; + if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_gw_family = nhc->nhc_gw_family; + /* only INET and INET6 are supported */ + if (likely(nhc->nhc_gw_family == AF_INET)) + rt->rt_gw4 = nhc->nhc_gw.ipv4; + else + rt->rt_gw6 = nhc->nhc_gw.ipv6; } + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID - rt->dst.tclassid = nh->nh_tclassid; + { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + rt->dst.tclassid = nh->nh_tclassid; + } #endif - rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) - cached = rt_cache_route(nh, rt); + cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = daddr; + } rt_add_uncached_list(rt); } } else @@ -1583,8 +1638,8 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; - rt->rt_gateway = 0; - rt->rt_uses_gateway = 0; + rt->rt_gw_family = 0; + rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1697,6 +1752,8 @@ static int __mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; @@ -1705,7 +1762,7 @@ static int __mkroute_input(struct sk_buff *skb, u32 itag = 0; /* get a working reference to the output device */ - out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). 
Please report.\n"); return -EINVAL; @@ -1722,10 +1779,14 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && - skb->protocol == htons(ETH_P_IP) && - (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) - IPCB(skb)->flags |= IPSKB_DOREDIRECT; + skb->protocol == htons(ETH_P_IP)) { + __be32 gw; + + gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; + if (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, gw)) + IPCB(skb)->flags |= IPSKB_DOREDIRECT; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -1742,12 +1803,12 @@ static int __mkroute_input(struct sk_buff *skb, } } - fnhe = find_exception(&FIB_RES_NH(*res), daddr); + fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2041,7 +2102,9 @@ local_input: do_cache = false; if (res->fi) { if (!itag) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + + rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -2071,16 +2134,16 @@ local_input: } if (do_cache) { - struct fib_nh *nh = &FIB_RES_NH(*res); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); - rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } - if (unlikely(!rt_cache_route(nh, rth))) + if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); @@ -2251,10 +2314,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; do_cache &= fi != NULL; if (fi) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; - struct fib_nh *nh = &FIB_RES_NH(*res); - fnhe = find_exception(nh, fl4->daddr); + fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { @@ -2262,12 +2325,12 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { + !(nhc->nhc_gw_family && + nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); + prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) @@ -2572,8 +2635,11 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_gateway = ort->rt_gateway; - rt->rt_uses_gateway = ort->rt_uses_gateway; + rt->rt_gw_family = ort->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + rt->rt_gw4 = ort->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + rt->rt_gw6 = ort->rt_gw6; INIT_LIST_HEAD(&rt->rt_uncached); } @@ -2652,9 +2718,22 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_uses_gateway && - nla_put_in_addr(skb, 
RTA_GATEWAY, rt->rt_gateway)) + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } expires = rt->dst.expires; if (expires) { @@ -2791,8 +2870,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || @@ -2810,8 +2889,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv4_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); if (err) return err; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e531344611a0..008545f63667 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -216,16 +216,15 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, refcount_set(&req->rsk_refcnt, 1); tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); - if (!inet_csk_reqsk_queue_add(sk, req, child)) { - bh_unlock_sock(child); - sock_put(child); - child = NULL; - reqsk_put(req); - } - } else { - reqsk_free(req); + if (inet_csk_reqsk_queue_add(sk, req, child)) + return child; + + bh_unlock_sock(child); + sock_put(child); } - return child; + __reqsk_free(req); + + return NULL; } EXPORT_SYMBOL(tcp_get_cookie_sock); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ba0fc4b18465..875867b64d6a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -49,6 +49,7 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; +static int one_day_secs = 24 * 3600; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -549,6 +550,15 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "fib_sync_mem", + .data = &sysctl_fib_sync_mem, + .maxlen = sizeof(sysctl_fib_sync_mem), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = &sysctl_fib_sync_mem_min, + .extra2 = &sysctl_fib_sync_mem_max, + }, { } }; @@ -1151,7 +1161,9 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_day_secs }, { .procname = "tcp_autocorking", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6baa6dc1b13b..1fa15beb8380 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,18 +457,6 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -void tcp_init_transfer(struct sock *sk, int bpf_op) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - tcp_mtup_init(sk); - icsk->icsk_af_ops->rebuild_header(sk); - tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op, 0, NULL); - tcp_init_congestion_control(sk); - 
tcp_init_buffer_space(sk); -} - static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); @@ -865,6 +853,18 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; + if (likely(!size)) { + skb = sk->sk_tx_skb_cache; + if (skb && !skb_cloned(skb)) { + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + sk->sk_tx_skb_cache = NULL; + pskb_trim(skb, 0); + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + skb_shinfo(skb)->tx_flags = 0; + memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); + return skb; + } + } /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); @@ -1098,30 +1098,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, } EXPORT_SYMBOL(tcp_sendpage); -/* Do not bother using a page frag for very small frames. - * But use this heuristic only for the first skb in write queue. - * - * Having no payload in skb->head allows better SACK shifting - * in tcp_shift_skb_data(), reducing sack/rack overhead, because - * write queue has less skbs. - * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB. - * This also speeds up tso_fragment(), since it wont fallback - * to tcp_fragment(). - */ -static int linear_payload_sz(bool first_skb) -{ - if (first_skb) - return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); - return 0; -} - -static int select_size(bool first_skb, bool zc) -{ - if (zc) - return 0; - return linear_payload_sz(first_skb); -} - void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { @@ -1272,7 +1248,6 @@ restart: if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; - int linear; new_segment: if (!sk_stream_memory_free(sk)) @@ -1283,8 +1258,7 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - linear = select_size(first_skb, zc); - skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; @@ -2552,6 +2526,11 @@ void tcp_write_queue_purge(struct sock *sk) sk_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); + skb = sk->sk_tx_skb_cache; + if (skb) { + __kfree_skb(skb); + sk->sk_tx_skb_cache = NULL; + } INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); @@ -2587,6 +2566,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_rx_skb_cache) { + __kfree_skb(sk->sk_rx_skb_cache); + sk->sk_rx_skb_cache = NULL; + } tp->copied_seq = tp->rcv_nxt; tp->urg_data = 0; tcp_write_queue_purge(sk); diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index cd4814f7e962..477cb4aa456c 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -49,9 +49,8 @@ #define DCTCP_MAX_ALPHA 1024U struct dctcp { - u32 acked_bytes_ecn; - u32 acked_bytes_total; - u32 prior_snd_una; + u32 old_delivered; + u32 old_delivered_ce; u32 prior_rcv_nxt; u32 dctcp_alpha; u32 next_seq; @@ -67,19 +66,14 @@ static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; module_param(dctcp_alpha_on_init, uint, 0644); MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); -static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; -module_param(dctcp_clamp_alpha_on_loss, uint, 0644); -MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, - "parameter for clamping alpha on loss"); - static struct tcp_congestion_ops dctcp_reno; static void dctcp_reset(const struct tcp_sock 
*tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; - ca->acked_bytes_ecn = 0; - ca->acked_bytes_total = 0; + ca->old_delivered = tp->delivered; + ca->old_delivered_ce = tp->delivered_ce; } static void dctcp_init(struct sock *sk) @@ -91,7 +85,6 @@ static void dctcp_init(struct sock *sk) sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); - ca->prior_snd_una = tp->snd_una; ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); @@ -123,37 +116,25 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); - u32 acked_bytes = tp->snd_una - ca->prior_snd_una; - - /* If ack did not advance snd_una, count dupack as MSS size. - * If ack did update window, do not count it at all. - */ - if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) - acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; - if (acked_bytes) { - ca->acked_bytes_total += acked_bytes; - ca->prior_snd_una = tp->snd_una; - - if (flags & CA_ACK_ECE) - ca->acked_bytes_ecn += acked_bytes; - } /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { - u64 bytes_ecn = ca->acked_bytes_ecn; + u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); - if (bytes_ecn) { + if (delivered_ce) { + u32 delivered = tp->delivered - ca->old_delivered; + /* If dctcp_shift_g == 1, a 32bit value would overflow - * after 8 Mbytes. + * after 8 M packets. */ - bytes_ecn <<= (10 - dctcp_shift_g); - do_div(bytes_ecn, max(1U, ca->acked_bytes_total)); + delivered_ce <<= (10 - dctcp_shift_g); + delivered_ce /= max(1U, delivered); - alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA); + alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); } /* dctcp_alpha can be read from dctcp_get_info() without * synchro, so we ask compiler to not use dctcp_alpha @@ -164,21 +145,23 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) } } -static void dctcp_state(struct sock *sk, u8 new_state) +static void dctcp_react_to_loss(struct sock *sk) { - if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { - struct dctcp *ca = inet_csk_ca(sk); + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); - /* If this extension is enabled, we clamp dctcp_alpha to - * max on packet loss; the motivation is that dctcp_alpha - * is an indicator to the extend of congestion and packet - * loss is an indicator of extreme congestion; setting - * this in practice turned out to be beneficial, and - * effectively assumes total congestion which reduces the - * window by half. - */ - ca->dctcp_alpha = DCTCP_MAX_ALPHA; - } + ca->loss_cwnd = tp->snd_cwnd; + tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); +} + +static void dctcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Recovery && + new_state != inet_csk(sk)->icsk_ca_state) + dctcp_react_to_loss(sk); + /* We handle RTO in dctcp_cwnd_event to ensure that we perform only + * one loss-adjustment per RTT. + */ } static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) @@ -190,6 +173,9 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) case CA_EVENT_ECN_NO_CE: dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; + case CA_EVENT_LOSS: + dctcp_react_to_loss(sk); + break; default: /* Don't care for the rest. 
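 *
 * (Editor's worked example for dctcp_update_alpha() above, not part
 * of this diff; assumes the default dctcp_shift_g of 4, i.e. g = 1/16.
 * Once per RTT the update is roughly
 *
 *	alpha -= alpha >> 4;				// (1 - g) * alpha
 *	alpha += (delivered_ce << 6) / delivered;	// g * F, scaled to 1024
 *
 * so with alpha = 512 and 25 of 100 newly delivered packets marked
 * CE, alpha becomes 512 - 32 + 16 = 496.)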
*/ break; @@ -200,6 +186,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Fill it also in case of VEGASINFO due to req struct limits. * We can still correctly retrieve it later. @@ -211,8 +198,10 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; info->dctcp.dctcp_alpha = ca->dctcp_alpha; - info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn; - info->dctcp.dctcp_ab_tot = ca->acked_bytes_total; + info->dctcp.dctcp_ab_ecn = tp->mss_cache * + (tp->delivered_ce - ca->old_delivered_ce); + info->dctcp.dctcp_ab_tot = tp->mss_cache * + (tp->delivered - ca->old_delivered); } *attr = INET_DIAG_DCTCPINFO; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5def3c48870e..20f6fac5882e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -77,7 +77,7 @@ #include <asm/unaligned.h> #include <linux/errqueue.h> #include <trace/events/tcp.h> -#include <linux/static_key.h> +#include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -113,22 +113,28 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ #if IS_ENABLED(CONFIG_TLS_DEVICE) -static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled); +static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)) { icsk->icsk_clean_acked = cad; - static_branch_inc(&clean_acked_data_enabled); + static_branch_inc(&clean_acked_data_enabled.key); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); void clean_acked_data_disable(struct inet_connection_sock *icsk) { - static_branch_dec(&clean_acked_data_enabled); + static_branch_slow_dec_deferred(&clean_acked_data_enabled); icsk->icsk_clean_acked = NULL; } EXPORT_SYMBOL_GPL(clean_acked_data_disable); + +void clean_acked_data_flush(void) +{ + static_key_deferred_flush(&clean_acked_data_enabled); +} +EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, @@ -402,11 +408,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + int room; + + room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; /* Check #1 */ - if (tp->rcv_ssthresh < tp->window_clamp && - (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_under_memory_pressure(sk)) { + if (room > 0 && !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. 
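 *
 * (Editor's note, not part of this diff: with room computed above as
 * min(window_clamp, tcp_space()) - rcv_ssthresh, the later
 * "rcv_ssthresh += min(room, incr)" can never push rcv_ssthresh past
 * either the clamp or the actual receive space, whereas the old
 * min(rcv_ssthresh + incr, window_clamp) bound only respected the
 * clamp.)
 *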
Increase window, if skb with such overhead @@ -419,8 +426,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) if (incr) { incr = max_t(int, incr, 2 * skb->len); - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, - tp->window_clamp); + tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } @@ -2252,7 +2258,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { - return !tp->retrans_stamp || + return tp->retrans_stamp && tcp_tsopt_ecr_before(tp, tp->retrans_stamp); } @@ -3521,7 +3527,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (rexmit == REXMIT_NONE) + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; if (unlikely(rexmit == 2)) { @@ -3598,7 +3604,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_retransmits = 0; #if IS_ENABLED(CONFIG_TLS_DEVICE) - if (static_branch_unlikely(&clean_acked_data_enabled)) + if (static_branch_unlikely(&clean_acked_data_enabled.key)) if (icsk->icsk_clean_acked) icsk->icsk_clean_acked(sk, ack); #endif @@ -5647,6 +5653,32 @@ discard: } EXPORT_SYMBOL(tcp_rcv_established); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + + /* Initialize the congestion window to start the transfer. + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been + * retransmitted. In light of RFC6298 more aggressive 1sec + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK + * retransmission has occurred. + */ + if (tp->total_retrans > 1 && tp->undo_marker) + tp->snd_cwnd = 1; + else + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); + tp->snd_cwnd_stamp = tcp_jiffies32; + + tcp_call_bpf(sk, bpf_op, 0, NULL); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -5748,6 +5780,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp) #endif } +static void tcp_try_undo_spurious_syn(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 syn_stamp; + + /* undo_marker is set when SYN or SYNACK times out. The timeout is + * spurious if the ACK's timestamp option echo value matches the + * original SYN timestamp. + */ + syn_stamp = tp->retrans_stamp; + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && + syn_stamp == tp->rx_opt.rcv_tsecr) + tp->undo_marker = 0; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) { @@ -5815,6 +5862,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + tcp_try_undo_spurious_syn(sk); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and @@ -5973,6 +6021,27 @@ reset_and_undo: return 1; } +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) +{ + tcp_try_undo_loss(sk, false); + inet_csk(sk)->icsk_retransmits = 0; + + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, + * we no longer need req so release it. + */ + reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); + + /* Re-arm the timer because data may have been sent out. 
+ * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); +} + /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -6069,22 +6138,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); - /* Once we leave TCP_SYN_RECV, we no longer need req - * so release it. - */ if (req) { - inet_csk(sk)->icsk_retransmits = 0; - reqsk_fastopen_remove(sk, req, false); - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); + tcp_rcv_synrecv_state_fastopen(sk); } else { + tcp_try_undo_spurious_syn(sk); + tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; } @@ -6119,16 +6177,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_FIN_WAIT1: { int tmo; - /* If we enter the TCP_FIN_WAIT1 state and we are a - * Fast Open socket and this is the first acceptable - * ACK we have received, this would have acknowledged - * our SYNACK so stop the SYNACK timer. - */ - if (req) { - /* We no longer need the request sock. */ - reqsk_fastopen_remove(sk, req, false); - tcp_rearm_rto(sk); - } + if (req) + tcp_rcv_synrecv_state_fastopen(sk); + if (tp->snd_una != tp->write_seq) break; @@ -6263,6 +6314,11 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * congestion control: Linux DCTCP asserts ECT on all packets, * including SYN, which is most optimal solution; however, * others, such as FreeBSD do not. + * + * Exception: At least one of the reserved bits of the TCP header (th->res1) is + * set, indicating the use of a future TCP extension (such as AccECN). See + * RFC8311 §4.3 which updates RFC3168 to allow the development of such + * extensions. */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, @@ -6282,7 +6338,7 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -6298,7 +6354,7 @@ static void tcp_openreq_init(struct request_sock *req, req->cookie_ts = 0; tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; - tcp_rsk(req)->snt_synack = tcp_clock_us(); + tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? 
rx_opt->rcv_tsval : 0; @@ -6502,8 +6558,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, reqsk_fastopen_remove(fastopen_sk, req, false); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); - reqsk_put(req); - goto drop; + goto drop_and_free; } sk->sk_data_ready(sk); bh_unlock_sock(fastopen_sk); @@ -6527,7 +6582,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, drop_and_release: dst_release(dst); drop_and_free: - reqsk_free(req); + __reqsk_free(req); drop: tcp_listendrop(sk); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 277d71239d75..af81e4a6a8d8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1673,7 +1673,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq || TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield || ((TCP_SKB_CB(tail)->tcp_flags | - TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) || + TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) || + !((TCP_SKB_CB(tail)->tcp_flags & + TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || #ifdef CONFIG_TLS_DEVICE @@ -1692,6 +1694,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq)) TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + /* We have to update both TCP_SKB_CB(tail)->tcp_flags and + * thtail->fin, so that the fast path in tcp_rcv_established() + * is not entered if we append a packet with a FIN. + * SYN, RST, URG are not present. + * ACK is set on both packets. + * PSH : we do not really care in TCP stack, + * at least for 'GRO' packets. + */ + thtail->fin |= th->fin; TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; if (TCP_SKB_CB(skb)->has_rxtstamp) { @@ -1774,6 +1785,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; @@ -1905,11 +1917,17 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { + skb_to_free = sk->sk_rx_skb_cache; + sk->sk_rx_skb_cache = NULL; ret = tcp_v4_do_rcv(sk, skb); - } else if (tcp_add_backlog(sk, skb)) { - goto discard_and_relse; + } else { + if (tcp_add_backlog(sk, skb)) + goto discard_and_relse; + skb_to_free = NULL; } bh_unlock_sock(sk); + if (skb_to_free) + __kfree_skb(skb_to_free); put_and_return: if (refcounted) @@ -2578,7 +2596,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; - module_put(net->ipv4.tcp_congestion_control->owner); + if (net->ipv4.tcp_congestion_control) + module_put(net->ipv4.tcp_congestion_control->owner); for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b467a7cabf40..c4848e7a0aad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -512,16 +512,6 @@ reset: inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been - * retransmitted. In light of RFC6298 more aggressive 1sec - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK - * retransmission has occurred. 
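 *
 * (Editor's note, not part of this diff: this cwnd clamp is not
 * dropped, it moves into tcp_init_transfer() in tcp_input.c earlier
 * in this diff, where it is additionally gated on tp->undo_marker so
 * that a SYN retransmission proven spurious by its timestamp echo no
 * longer costs the connection its initial cwnd.)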
- */ - if (tp->total_retrans > 1) - tp->snd_cwnd = 1; - else - tp->snd_cwnd = tcp_init_cwnd(tp, dst); - tp->snd_cwnd_stamp = tcp_jiffies32; } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) @@ -658,7 +648,7 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, { int n = 0; - nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); + nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { @@ -951,14 +941,14 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) static const struct genl_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, - .policy = tcp_metrics_nl_policy, }, { .cmd = TCP_METRICS_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_del, - .policy = tcp_metrics_nl_policy, .flags = GENL_ADMIN_PERM, }, }; @@ -968,6 +958,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, .maxattr = TCP_METRICS_ATTR_MAX, + .policy = tcp_metrics_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = tcp_metrics_nl_ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 79900f783e0d..9c2a0d36fb20 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -522,6 +522,11 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } + if (req->num_timeout) { + newtp->undo_marker = treq->snt_isn; + newtp->retrans_stamp = div_u64(treq->snt_synack, + USEC_PER_SEC / TCP_TS_HZ); + } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4522579aaca2..0c4ed66dc1bf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -52,12 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); - if (val > tp->tcp_clock_cache) - tp->tcp_clock_cache = val; - - val = div_u64(val, NSEC_PER_USEC); - if (val > tp->tcp_mstamp) - tp->tcp_mstamp = val; + tp->tcp_clock_cache = val; + tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, @@ -3092,7 +3088,6 @@ void tcp_send_fin(struct sock *sk) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { -coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; @@ -3108,11 +3103,9 @@ coalesce: } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); - if (unlikely(!skb)) { - if (tskb) - goto coalesce; + if (unlikely(!skb)) return; - } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); @@ -3254,7 +3247,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, skb->skb_mstamp_ns = cookie_init_timestamp(req); else #endif + { skb->skb_mstamp_ns = tcp_clock_ns(); + if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ + tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); + } #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f0c86398e6a7..2ac23da42dd2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -393,6 +393,9 @@ static void 
tcp_fastopen_synack_timer(struct sock *sk) tcp_write_err(sk); return; } + /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */ + if (icsk->icsk_retransmits == 1) + tcp_enter_loss(sk); /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error * returned from rtx_syn_ack() to make it more persistent like * regular retransmit because if the child socket has been accepted diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 372fdc5381a9..3c58ba02af7d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1631,7 +1631,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) EXPORT_SYMBOL(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *peeked, int *off, int *err) + int noblock, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; @@ -1650,13 +1650,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, break; error = -EAGAIN; - *peeked = 0; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_destructor, - peeked, off, err, - &last); + off, err, &last); if (skb) { spin_unlock_bh(&queue->lock); return skb; @@ -1677,8 +1675,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, skb = __skb_try_recv_from_queue(sk, queue, flags, udp_skb_dtor_locked, - peeked, off, err, - &last); + off, err, &last); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) @@ -1713,8 +1710,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, peeking, off; - int err; + int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; @@ -1722,9 +1718,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = flags & MSG_PEEK; off = sk_peek_offset(sk, flags); - skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &off, &err); if (!skb) return err; @@ -1762,7 +1757,7 @@ try_again: } if (unlikely(err)) { - if (!peeked) { + if (!peeking) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); @@ -1771,7 +1766,7 @@ try_again: return err; } - if (!peeked) + if (!peeking) UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 64f9715173ac..065334b41d57 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -352,6 +352,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; + unsigned int ulen; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -359,6 +360,12 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } + /* Do not deal with padded or malicious packets, sorry ! */ + ulen = ntohs(uh->len); + if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); @@ -377,13 +384,14 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, /* Terminate the flow on len mismatch or if it grow "too much". 
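 *
 * (Editor's summary of the rules below, not part of this diff: a
 * later segment longer than the first, uh2->len, flushes the flow
 * without merging; one of equal length merges and the flow stays
 * open; a shorter one merges and then completes the aggregate,
 * matching GSO semantics where only the final segment may be short.)
 *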
* Under small packet flood GRO count could elsewhere grow a lot - * leading to execessive truesize values + * leading to excessive truesize values. + * On len mismatch merge the first packet shorter than gso_size, + * otherwise complete the GRO packet. */ - if (!skb_gro_receive(p, skb) && + if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) || + ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; - else if (uh->len != uh2->len) - pp = p; return pp; } diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c deleted file mode 100644 index 856d2dfdb44b..000000000000 --- a/net/ipv4/xfrm4_mode_beet.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4. - * - * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com> - * Miika Komu <miika@iki.fi> - * Herbert Xu <herbert@gondor.apana.org.au> - * Abhinav Pathak <abhinav.pathak@hiit.fi> - * Jeff Ahrenholz <ahrenholz@gmail.com> - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/stringify.h> -#include <net/dst.h> -#include <net/ip.h> -#include <net/xfrm.h> - -static void xfrm4_beet_make_header(struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - - iph->ihl = 5; - iph->version = 4; - - iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; - iph->tos = XFRM_MODE_SKB_CB(skb)->tos; - - iph->id = XFRM_MODE_SKB_CB(skb)->id; - iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; - iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; -} - -/* Add encapsulation header. - * - * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. - */ -static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ip_beet_phdr *ph; - struct iphdr *top_iph; - int hdrlen, optlen; - - hdrlen = 0; - optlen = XFRM_MODE_SKB_CB(skb)->optlen; - if (unlikely(optlen)) - hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - - skb_set_network_header(skb, -x->props.header_len - - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); - if (x->sel.family != AF_INET6) - skb->network_header += IPV4_BEET_PHMAXLEN; - skb->mac_header = skb->network_header + - offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*top_iph); - - xfrm4_beet_make_header(skb); - - ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); - - top_iph = ip_hdr(skb); - - if (unlikely(optlen)) { - BUG_ON(optlen < 0); - - ph->padlen = 4 - (optlen & 4); - ph->hdrlen = optlen / 8; - ph->nexthdr = top_iph->protocol; - if (ph->padlen) - memset(ph + 1, IPOPT_NOP, ph->padlen); - - top_iph->protocol = IPPROTO_BEETPH; - top_iph->ihl = sizeof(struct iphdr) / 4; - } - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - - return 0; -} - -static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) -{ - struct iphdr *iph; - int optlen = 0; - int err = -EINVAL; - - if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { - struct ip_beet_phdr *ph; - int phlen; - - if (!pskb_may_pull(skb, sizeof(*ph))) - goto out; - - ph = (struct ip_beet_phdr *)skb->data; - - phlen = sizeof(*ph) + ph->padlen; - optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); - if (optlen < 0 || optlen & 3 || optlen > 250) - goto out; - - XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; - - if (!pskb_may_pull(skb, phlen)) - goto out; - __skb_pull(skb, phlen); - } - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - skb_mac_header_rebuild(skb); - - xfrm4_beet_make_header(skb); - - 
iph = ip_hdr(skb); - - iph->ihl += optlen / 4; - iph->tot_len = htons(skb->len); - iph->daddr = x->sel.daddr.a4; - iph->saddr = x->sel.saddr.a4; - iph->check = 0; - iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); - err = 0; -out: - return err; -} - -static struct xfrm_mode xfrm4_beet_mode = { - .input2 = xfrm4_beet_input, - .input = xfrm_prepare_input, - .output2 = xfrm4_beet_output, - .output = xfrm4_prepare_output, - .owner = THIS_MODULE, - .encap = XFRM_MODE_BEET, - .flags = XFRM_MODE_FLAG_TUNNEL, -}; - -static int __init xfrm4_beet_init(void) -{ - return xfrm_register_mode(&xfrm4_beet_mode, AF_INET); -} - -static void __exit xfrm4_beet_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET); - BUG_ON(err); -} - -module_init(xfrm4_beet_init); -module_exit(xfrm4_beet_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c deleted file mode 100644 index 1ad2c2c4e250..000000000000 --- a/net/ipv4/xfrm4_mode_transport.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. - * - * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/stringify.h> -#include <net/dst.h> -#include <net/ip.h> -#include <net/xfrm.h> -#include <net/protocol.h> - -/* Add encapsulation header. - * - * The IP header will be moved forward to make space for the encapsulation - * header. - */ -static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - int ihl = iph->ihl * 4; - - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - - skb_set_network_header(skb, -x->props.header_len); - skb->mac_header = skb->network_header + - offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + ihl; - __skb_pull(skb, ihl); - memmove(skb_network_header(skb), iph, ihl); - return 0; -} - -/* Remove encapsulation header. - * - * The IP header will be moved over the top of the encapsulation header. - * - * On entry, skb->h shall point to where the IP header should be and skb->nh - * shall be set to where the IP header currently is. skb->data shall point - * to the start of the payload. 
- */ -static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int ihl = skb->data - skb_transport_header(skb); - - if (skb->transport_header != skb->network_header) { - memmove(skb_transport_header(skb), - skb_network_header(skb), ihl); - skb->network_header = skb->transport_header; - } - ip_hdr(skb)->tot_len = htons(skb->len + ihl); - skb_reset_transport_header(skb); - return 0; -} - -static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, - struct sk_buff *skb, - netdev_features_t features) -{ - const struct net_offload *ops; - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct xfrm_offload *xo = xfrm_offload(skb); - - skb->transport_header += x->props.header_len; - ops = rcu_dereference(inet_offloads[xo->proto]); - if (likely(ops && ops->callbacks.gso_segment)) - segs = ops->callbacks.gso_segment(skb, features); - - return segs; -} - -static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) -{ - struct xfrm_offload *xo = xfrm_offload(skb); - - skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len); - - if (xo->flags & XFRM_GSO_SEGMENT) { - skb_reset_transport_header(skb); - skb->transport_header -= x->props.header_len; - } -} - -static struct xfrm_mode xfrm4_transport_mode = { - .input = xfrm4_transport_input, - .output = xfrm4_transport_output, - .gso_segment = xfrm4_transport_gso_segment, - .xmit = xfrm4_transport_xmit, - .owner = THIS_MODULE, - .encap = XFRM_MODE_TRANSPORT, -}; - -static int __init xfrm4_transport_init(void) -{ - return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); -} - -static void __exit xfrm4_transport_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); - BUG_ON(err); -} - -module_init(xfrm4_transport_init); -module_exit(xfrm4_transport_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c deleted file mode 100644 index 2a9764bd1719..000000000000 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. - * - * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> - */ - -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/stringify.h> -#include <net/dst.h> -#include <net/inet_ecn.h> -#include <net/ip.h> -#include <net/xfrm.h> - -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *inner_iph = ipip_hdr(skb); - - if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP_ECN_set_ce(inner_iph); -} - -/* Add encapsulation header. - * - * The top IP header will be constructed per RFC 2401. 
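 *
 * (Editor's note, not part of this diff: this file, like
 * xfrm4_mode_beet.c and xfrm4_mode_transport.c above and the
 * INET6_XFRM_MODE_* Kconfig entries below, goes away because the
 * encapsulation modes become builtin, family-agnostic code in the
 * xfrm core, keyed off the now-embedded x->outer_mode as seen in the
 * __xfrm4_output() change further down.)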
- */ -static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct iphdr *top_iph; - int flags; - - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - - skb_set_network_header(skb, -x->props.header_len); - skb->mac_header = skb->network_header + - offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*top_iph); - top_iph = ip_hdr(skb); - - top_iph->ihl = 5; - top_iph->version = 4; - - top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family); - - /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */ - if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) - top_iph->tos = 0; - else - top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos; - top_iph->tos = INET_ECN_encapsulate(top_iph->tos, - XFRM_MODE_SKB_CB(skb)->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - - top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - ip_select_ident(dev_net(dst->dev), skb, NULL); - - return 0; -} - -static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int err = -EINVAL; - - if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) - goto out; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto out; - - err = skb_unclone(skb, GFP_ATOMIC); - if (err) - goto out; - - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - - skb_reset_network_header(skb); - skb_mac_header_rebuild(skb); - if (skb->mac_len) - eth_hdr(skb)->h_proto = skb->protocol; - - err = 0; - -out: - return err; -} - -static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, - struct sk_buff *skb, - netdev_features_t features) -{ - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); -} - -static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) -{ - struct xfrm_offload *xo = xfrm_offload(skb); - - if (xo->flags & XFRM_GSO_SEGMENT) - skb->transport_header = skb->network_header + - sizeof(struct iphdr); - - skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + x->props.header_len); -} - -static struct xfrm_mode xfrm4_tunnel_mode = { - .input2 = xfrm4_mode_tunnel_input, - .input = xfrm_prepare_input, - .output2 = xfrm4_mode_tunnel_output, - .output = xfrm4_prepare_output, - .gso_segment = xfrm4_mode_tunnel_gso_segment, - .xmit = xfrm4_mode_tunnel_xmit, - .owner = THIS_MODULE, - .encap = XFRM_MODE_TUNNEL, - .flags = XFRM_MODE_FLAG_TUNNEL, -}; - -static int __init xfrm4_mode_tunnel_init(void) -{ - return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); -} - -static void __exit xfrm4_mode_tunnel_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); - BUG_ON(err); -} - -module_init(xfrm4_mode_tunnel_init); -module_exit(xfrm4_mode_tunnel_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index be980c195fc5..9bb8905088c7 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -58,21 +58,6 @@ int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) return 
xfrm4_extract_header(skb); } -int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm_inner_extract_output(x, skb); - if (err) - return err; - - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - skb->protocol = htons(ETH_P_IP); - - return x->outer_mode->output2(x, skb); -} -EXPORT_SYMBOL(xfrm4_prepare_output); - int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -87,6 +72,8 @@ int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; + const struct xfrm_state_afinfo *afinfo; + int ret = -EAFNOSUPPORT; #ifdef CONFIG_NETFILTER if (!x) { @@ -95,7 +82,15 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) } #endif - return x->outer_mode->afinfo->output_finish(sk, skb); + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); + if (likely(afinfo)) + ret = afinfo->output_finish(sk, skb); + else + kfree_skb(skb); + rcu_read_unlock(); + + return ret; } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index d73a6d6652f6..cdef8f9a3b01 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -12,7 +12,6 @@ #include <linux/err.h> #include <linux/kernel.h> #include <linux/inetdevice.h> -#include <linux/if_tunnel.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> @@ -69,17 +68,6 @@ static int xfrm4_get_saddr(struct net *net, int oif, return 0; } -static int xfrm4_get_tos(const struct flowi *fl) -{ - return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ -} - -static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, - int nfheader_len) -{ - return 0; -} - static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { @@ -97,8 +85,11 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; - xdst->u.rt.rt_gateway = rt->rt_gateway; - xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; + xdst->u.rt.rt_gw_family = rt->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + xdst->u.rt.rt_gw4 = rt->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + xdst->u.rt.rt_gw6 = rt->rt_gw6; xdst->u.rt.rt_pmtu = rt->rt_pmtu; xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); @@ -107,118 +98,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return 0; } -static void -_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) -{ - const struct iphdr *iph = ip_hdr(skb); - u8 *xprth = skb_network_header(skb) + iph->ihl * 4; - struct flowi4 *fl4 = &fl->u.ip4; - int oif = 0; - - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; - - memset(fl4, 0, sizeof(struct flowi4)); - fl4->flowi4_mark = skb->mark; - fl4->flowi4_oif = reverse ? 
skb->skb_iif : oif; - - if (!ip_is_fragment(iph)) { - switch (iph->protocol) { - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_TCP: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ports; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ports = (__be16 *)xprth; - - fl4->fl4_sport = ports[!!reverse]; - fl4->fl4_dport = ports[!reverse]; - } - break; - - case IPPROTO_ICMP: - if (xprth + 2 < skb->data || - pskb_may_pull(skb, xprth + 2 - skb->data)) { - u8 *icmp; - - xprth = skb_network_header(skb) + iph->ihl * 4; - icmp = xprth; - - fl4->fl4_icmp_type = icmp[0]; - fl4->fl4_icmp_code = icmp[1]; - } - break; - - case IPPROTO_ESP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ehdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ehdr[0]; - } - break; - - case IPPROTO_AH: - if (xprth + 8 < skb->data || - pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ah_hdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ah_hdr[1]; - } - break; - - case IPPROTO_COMP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - ipcomp_hdr = (__be16 *)xprth; - - fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); - } - break; - - case IPPROTO_GRE: - if (xprth + 12 < skb->data || - pskb_may_pull(skb, xprth + 12 - skb->data)) { - __be16 *greflags; - __be32 *gre_hdr; - - xprth = skb_network_header(skb) + iph->ihl * 4; - greflags = (__be16 *)xprth; - gre_hdr = (__be32 *)xprth; - - if (greflags[0] & GRE_KEY) { - if (greflags[0] & GRE_CSUM) - gre_hdr++; - fl4->fl4_gre_key = gre_hdr[1]; - } - } - break; - - default: - fl4->fl4_ipsec_spi = 0; - break; - } - } - fl4->flowi4_proto = iph->protocol; - fl4->daddr = reverse ? iph->saddr : iph->daddr; - fl4->saddr = reverse ? 
iph->daddr : iph->saddr; - fl4->flowi4_tos = iph->tos; -} - static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -271,9 +150,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, - .decode_session = _decode_session4, - .get_tos = xfrm4_get_tos, - .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, .blackhole_route = ipv4_blackhole_route, }; diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 35c54865dc42..bcab48944c15 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -46,7 +46,7 @@ static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol) handler != NULL; \ handler = rcu_dereference(handler->next)) \ -int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm4_protocol *handler; @@ -61,7 +61,6 @@ int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } -EXPORT_SYMBOL(xfrm4_rcv_cb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 613282c65a10..cd915e332c98 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -135,44 +135,11 @@ config INET6_TUNNEL tristate default n -config INET6_XFRM_MODE_TRANSPORT - tristate "IPv6: IPsec transport mode" - default IPV6 - select XFRM - ---help--- - Support for IPsec transport mode. - - If unsure, say Y. - -config INET6_XFRM_MODE_TUNNEL - tristate "IPv6: IPsec tunnel mode" - default IPV6 - select XFRM - ---help--- - Support for IPsec tunnel mode. - - If unsure, say Y. - -config INET6_XFRM_MODE_BEET - tristate "IPv6: IPsec BEET mode" - default IPV6 - select XFRM - ---help--- - Support for IPsec BEET mode. - - If unsure, say Y. - -config INET6_XFRM_MODE_ROUTEOPTIMIZATION - tristate "IPv6: MIPv6 route optimization mode" - select XFRM - ---help--- - Support for MIPv6 route optimization mode. 
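The INET6_XFRM_MODE_* options removed above track the xfrm mode flattening seen throughout this series: transport, tunnel and BEET handling moves into the xfrm core, struct xfrm_mode becomes plain data embedded in struct xfrm_state (note the x->outer_mode.family and x->inner_mode users elsewhere in this diff), and callers dispatch with a switch on the encap type instead of an indirect call through a registered ops struct, as the new xfrm6_outer_mode_gso_segment() in esp6_offload.c does. A minimal userspace sketch of that dispatch shape, using simplified stand-in types and constants rather than the kernel's definitions:

#include <stdio.h>

/* Stand-ins for the kernel's encap constants; values are illustrative. */
enum { XFRM_MODE_TRANSPORT, XFRM_MODE_TUNNEL, XFRM_MODE_BEET };

struct xfrm_mode {
	unsigned char encap;   /* encap type, switched on directly */
	unsigned char family;  /* AF_INET / AF_INET6 */
};

struct xfrm_state {
	struct xfrm_mode outer_mode; /* embedded value: no pointer, no module */
};

static int tunnel_gso_segment(void)    { puts("tunnel path");    return 0; }
static int transport_gso_segment(void) { puts("transport path"); return 0; }

/* Mirrors the shape of xfrm6_outer_mode_gso_segment() in this diff:
 * a direct switch replaces the old x->outer_mode->gso_segment() call. */
static int outer_mode_gso_segment(const struct xfrm_state *x)
{
	switch (x->outer_mode.encap) {
	case XFRM_MODE_TUNNEL:
		return tunnel_gso_segment();
	case XFRM_MODE_TRANSPORT:
		return transport_gso_segment();
	default:
		return -1; /* the kernel returns ERR_PTR(-EOPNOTSUPP) here */
	}
}

int main(void)
{
	struct xfrm_state x = { .outer_mode = { .encap = XFRM_MODE_TUNNEL } };
	return outer_mode_gso_segment(&x);
}

With the handlers always built in, there is nothing left for these prompts to gate, which is also why IPV6_VTI below can drop its dependency on INET6_XFRM_MODE_TUNNEL and simply select XFRM.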
- config IPV6_VTI tristate "Virtual (secure) IPv6: tunneling" select IPV6_TUNNEL select NET_IP_TUNNEL - depends on INET6_XFRM_MODE_TUNNEL + select XFRM ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index e0026fa1261b..8ccf35514015 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -35,10 +35,6 @@ obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o -obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o -obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o -obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o -obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_IPV6_ILA) += ila/ obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4ae17a966ae3..f96d1de79509 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -173,7 +173,8 @@ static int addrconf_ifdown(struct net_device *dev, int how); static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, - u32 flags, u32 noflags); + u32 flags, u32 noflags, + bool no_gw); static void addrconf_dad_start(struct inet6_ifaddr *ifp); static void addrconf_dad_work(struct work_struct *w); @@ -610,11 +611,13 @@ static int inet6_netconf_valid_get_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_ipv6_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_ipv6_policy, extack); - err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_ipv6_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_ipv6_policy, extack); if (err) return err; @@ -1230,10 +1233,8 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r { struct fib6_info *f6i; - f6i = addrconf_get_prefix_route(&ifp->addr, - ifp->prefix_len, - ifp->idev->dev, - 0, RTF_GATEWAY | RTF_DEFAULT); + f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) ip6_del_rt(dev_net(ifp->idev->dev), f6i); @@ -2402,7 +2403,8 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric, static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, - u32 flags, u32 noflags) + u32 flags, u32 noflags, + bool no_gw) { struct fib6_node *fn; struct fib6_info *rt = NULL; @@ -2419,7 +2421,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) + if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + continue; + if (no_gw && rt->fib6_nh.fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -2717,7 +2721,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) pinfo->prefix_len, dev, RTF_ADDRCONF | RTF_PREFIX_RT, - RTF_GATEWAY | RTF_DEFAULT); + RTF_DEFAULT, true); if (rt) { /* Autoconf prefix route */ @@ -4563,8 +4567,8 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, u32 ifa_flags; 
int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); if (err < 0) return err; @@ -4588,10 +4592,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, struct fib6_info *f6i; u32 prio; - f6i = addrconf_get_prefix_route(&ifp->addr, - ifp->prefix_len, - ifp->idev->dev, - 0, RTF_GATEWAY | RTF_DEFAULT); + f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->idev->dev, 0, RTF_DEFAULT, true); if (!f6i) return -ENOENT; @@ -4729,8 +4731,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct ifa6_config cfg; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); if (err < 0) return err; @@ -5086,8 +5088,8 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, fillargs->flags |= NLM_F_DUMP_FILTERED; } - err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, - ifa_ipv6_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); if (err < 0) return err; @@ -5237,11 +5239,11 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, - ifa_ipv6_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); - err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, - ifa_ipv6_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); if (err) return err; @@ -5667,8 +5669,8 @@ static int inet6_validate_link_af(const struct net_device *dev, if (dev && !__in6_dev_get(dev)) return -EAFNOSUPPORT; - return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy, - NULL); + return nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, + inet6_af_policy, NULL); } static int check_addr_gen_mode(int mode) @@ -5700,7 +5702,7 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (!idev) return -EAFNOSUPPORT; - if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) BUG(); if (tb[IFLA_INET6_TOKEN]) { @@ -5752,7 +5754,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN)) goto nla_put_failure; - protoinfo = nla_nest_start(skb, IFLA_PROTINFO); + protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO); if (!protoinfo) goto nla_put_failure; @@ -5972,7 +5974,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, - ifp->idev->dev, 0, 0); + ifp->idev->dev, 0, 0, + false); if (rt) ip6_del_rt(net, rt); } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 6c79af056d9b..763a947e0d14 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -5,7 +5,7 @@ #include <linux/export.h> #include <net/ipv6.h> -#include <net/addrconf.h> +#include <net/ipv6_stubs.h> #include <net/ip.h> /* if ipv6 module registers this function is used by xfrm to force all @@ -144,43 +144,53 @@ static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) return NULL; } -static struct fib6_info * +static int eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int flags) + int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) { - return NULL; + return -EAFNOSUPPORT; } -static struct fib6_info * +static int eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags) + struct fib6_result *res, int flags) { - return NULL; + return -EAFNOSUPPORT; } -static struct fib6_info * -eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, int strict) +static void +eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict) { - return f6i; } static u32 -eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr) +eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { return 0; } +static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, - .fib6_multipath_select = eafnosupport_fib6_multipath_select, + .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, + .fib6_nh_init = eafnosupport_fib6_nh_init, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index d43d076c98f5..642fc6ac13d2 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -383,8 +383,8 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, u32 label; int err = 0; - err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, + ifal_policy, extack); if (err < 0) return err; @@ -476,7 +476,7 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, } if (nlmsg_attrlen(nlh, sizeof(*ifal))) { - NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst"); + 
NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request"); return -EINVAL; } @@ -537,8 +537,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, - ifal_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, + IFAL_MAX, ifal_policy, extack); ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { @@ -546,8 +546,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*ifal), tb, IFAL_MAX, - ifal_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifal), tb, IFAL_MAX, + ifal_policy, extack); if (err) return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2f45d2a3e3a3..c04ae282f4e4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -56,6 +56,7 @@ #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> +#include <net/ipv6_stubs.h> #include <net/ndisc.h> #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> @@ -546,12 +547,6 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct net *net = sock_net(sk); switch (cmd) { - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); - case SIOCADDRT: case SIOCDELRT: @@ -584,6 +579,7 @@ const struct proto_ops inet6_stream_ops = { .getname = inet6_getname, .poll = tcp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ + .gettstamp = sock_gettstamp, .listen = inet_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ @@ -617,6 +613,7 @@ const struct proto_ops inet6_dgram_ops = { .getname = inet6_getname, .poll = udp_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ @@ -847,6 +844,17 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; + net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; + net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; + + /* By default, rate limit error messages. + * Except for pmtu discovery, it would break it. + * proc_do_large_bitmap needs pointer to the bitmap. 
+ */ + bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1); + bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1); + net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask; + net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; @@ -914,8 +922,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, - .fib6_multipath_select = fib6_multipath_select, + .fib6_select_path = fib6_select_path, .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, + .fib6_nh_init = fib6_nh_init, + .fib6_nh_release = fib6_nh_release, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d46b4eb645c2..d453cf417b03 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -74,13 +74,13 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, goto out; if (sp->len == XFRM_MAX_DEPTH) - goto out; + goto out_reset; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET6); if (!x) - goto out; + goto out_reset; sp->xvec[sp->len++] = x; sp->olen++; @@ -88,7 +88,7 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo) { xfrm_state_put(x); - goto out; + goto out_reset; } } @@ -109,6 +109,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, xfrm_input(skb, IPPROTO_ESP, spi, -2); return ERR_PTR(-EINPROGRESS); +out_reset: + secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; @@ -134,6 +136,44 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) xo->proto = proto; } +static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + __skb_push(skb, skb->mac_len); + return skb_mac_gso_segment(skb, features); +} + +static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct xfrm_offload *xo = xfrm_offload(skb); + + skb->transport_header += x->props.header_len; + ops = rcu_dereference(inet6_offloads[xo->proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + +static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + switch (x->outer_mode.encap) { + case XFRM_MODE_TUNNEL: + return xfrm6_tunnel_gso_segment(x, skb, features); + case XFRM_MODE_TRANSPORT: + return xfrm6_transport_gso_segment(x, skb, features); + } + + return ERR_PTR(-EOPNOTSUPP); +} + static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -172,7 +212,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, xo->flags |= XFRM_GSO_SEGMENT; - return x->outer_mode->gso_segment(x, skb, esp_features); + return xfrm6_outer_mode_gso_segment(x, skb, esp_features); } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index f590446595d8..06d1b7763600 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -61,16 +61,16 @@ unsigned int fib6_rules_seq_read(struct net 
*net) } /* called with rcu lock held; no reference taken on fib6_info */ -struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags) +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) { - struct fib6_info *f6i; int err; if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = fib6_table_lookup, .lookup_data = &oif, + .result = res, .flags = FIB_LOOKUP_NOREF, }; @@ -78,19 +78,15 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, err = fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); - if (err) - return ERR_PTR(err); - - f6i = arg.result ? : net->ipv6.fib6_null_entry; } else { - f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, - oif, fl6, flags); - if (!f6i || f6i == net->ipv6.fib6_null_entry) - f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, - oif, fl6, flags); + err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif, + fl6, res, flags); + if (err || res->f6i == net->ipv6.fib6_null_entry) + err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, + oif, fl6, res, flags); } - return f6i; + return err; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, @@ -98,9 +94,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { + struct fib6_result res = {}; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .lookup_data = skb, + .result = &res, .flags = FIB_LOOKUP_NOREF, }; @@ -110,8 +108,8 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); - if (arg.result) - return arg.result; + if (res.rt6) + return &res.rt6->dst; } else { struct rt6_info *rt; @@ -157,11 +155,11 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { + struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct net *net = rule->fr_net; struct fib6_table *table; - struct fib6_info *f6i; - int err = -EAGAIN, *oif; + int err, *oif; u32 tb_id; switch (rule->action) { @@ -182,14 +180,12 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, return -EAGAIN; oif = (int *)arg->lookup_data; - f6i = fib6_table_lookup(net, table, *oif, flp6, flags); - if (f6i != net->ipv6.fib6_null_entry) { + err = fib6_table_lookup(net, table, *oif, flp6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) err = fib6_rule_saddr(net, rule, flags, flp6, - fib6_info_nh_dev(f6i)); - - if (likely(!err)) - arg->result = f6i; - } + res->nh->fib_nh_dev); + else + err = -EAGAIN; return err; } @@ -197,6 +193,7 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { + struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; @@ -251,7 +248,7 @@ again: discard_pkt: dst_hold(&rt->dst); out: - arg->result = rt; + res->rt6 = rt; return err; } @@ -266,9 +263,13 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { - struct rt6_info *rt = (struct rt6_info *) arg->result; + struct 
fib6_result *res = arg->result; + struct rt6_info *rt = res->rt6; struct net_device *dev = NULL; + if (!rt) + return false; + if (rt->rt6i_idev) dev = rt->rt6i_idev->dev; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 802faa2fcc0e..afb915807cd0 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -168,22 +168,21 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } -static bool icmpv6_mask_allow(int type) +static bool icmpv6_mask_allow(struct net *net, int type) { - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) + if (type > ICMPV6_MSG_MAX) return true; - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + /* Limit if icmp type is set in ratemask. */ + if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } -static bool icmpv6_global_allow(int type) +static bool icmpv6_global_allow(struct net *net, int type) { - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow()) @@ -202,7 +201,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(type)) + if (icmpv6_mask_allow(net, type)) return true; /* @@ -511,7 +510,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; mip6_addr_swap(skb); @@ -683,12 +682,20 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); + bool acast; + + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && + net->ipv6.sysctl.icmpv6_echo_ignore_multicast) + return; saddr = &ipv6_hdr(skb)->daddr; + acast = ipv6_anycast_destination(skb_dst(skb), saddr); + if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) + return; + if (!ipv6_unicast_destination(skb) && - !(net->ipv6.sysctl.anycast_src_echo_reply && - ipv6_anycast_destination(skb_dst(skb), saddr))) + !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); @@ -723,6 +730,11 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; + /* Check the ratelimit */ + if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + goto out_dst_release; + idev = __in6_dev_get(skb->dev); msg.skb = skb; @@ -743,6 +755,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); } +out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); @@ -1115,6 +1128,27 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "echo_ignore_multicast", + .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "echo_ignore_anycast", + .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ratemask", + .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr, + .maxlen = ICMPV6_MSG_MAX + 1, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { }, }; @@ 
-1129,6 +1163,9 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) if (table) { table[0].data = &net->ipv6.sysctl.icmpv6_time; table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; + table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast; + table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; + table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; } return table; } diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 3d56a2fb6f86..422dcc691f71 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -146,7 +146,8 @@ static int ila_build_state(struct nlattr *nla, if (family != AF_INET6) return -EINVAL; - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); + ret = nla_parse_nested_deprecated(tb, ILA_ATTR_MAX, nla, + ila_nl_policy, extack); if (ret < 0) return ret; diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 18fac76b9520..257d2b681246 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -16,29 +16,29 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { static const struct genl_ops ila_nl_ops[] = { { .cmd = ILA_CMD_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_add_mapping, - .policy = ila_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_del_mapping, - .policy = ila_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_flush, - .policy = ila_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_get_mapping, .start = ila_xlat_nl_dump_start, .dumpit = ila_xlat_nl_dump, .done = ila_xlat_nl_dump_done, - .policy = ila_nl_policy, }, }; @@ -49,6 +49,7 @@ struct genl_family ila_nl_family __ro_after_init = { .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, .maxattr = ILA_ATTR_MAX, + .policy = ila_nl_policy, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 79d2e43c05c5..5fc1f4e0c0cf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -417,6 +417,7 @@ int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) done: rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); return ret; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6613d8dbb0e5..08e0390e001c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -162,7 +162,7 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) } INIT_LIST_HEAD(&f6i->fib6_siblings); - atomic_inc(&f6i->fib6_ref); + refcount_set(&f6i->fib6_ref, 1); return f6i; } @@ -175,10 +175,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head) WARN_ON(f6i->fib6_node); bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - if (bucket) { - f6i->rt6i_exception_bucket = NULL; - kfree(bucket); - } + kfree(bucket); if (f6i->rt6i_pcpu) { int cpu; @@ -199,10 +196,7 @@ void fib6_info_destroy_rcu(struct rcu_head *head) free_percpu(f6i->rt6i_pcpu); } - lwtstate_put(f6i->fib6_nh.nh_lwtstate); - - if (f6i->fib6_nh.nh_dev) - dev_put(f6i->fib6_nh.nh_dev); + fib6_nh_release(&f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); @@ -357,10 +351,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } /* called with rcu lock held; no reference 
taken on fib6_info */ -struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags) +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) { - return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags); + return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, + res, flags); } static void __net_init fib6_tables_init(struct net *net) @@ -851,8 +846,8 @@ insert_above: RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&rcu_dereference_protected(in->leaf, - lockdep_is_held(&table->tb6_lock))->fib6_ref); + fib6_info_hold(rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) @@ -921,9 +916,7 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, if (pcpu_rt) { struct fib6_info *from; - from = rcu_dereference_protected(pcpu_rt->from, - lockdep_is_held(&table->tb6_lock)); - rcu_assign_pointer(pcpu_rt->from, NULL); + from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } @@ -934,7 +927,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; - if (atomic_read(&rt->fib6_ref) != 1) { + if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes @@ -947,7 +940,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); - atomic_inc(&new_leaf->fib6_ref); + fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); @@ -1113,7 +1106,7 @@ add: return err; rcu_assign_pointer(rt->fib6_next, iter); - atomic_inc(&rt->fib6_ref); + fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) @@ -1141,7 +1134,7 @@ add: if (err) return err; - atomic_inc(&rt->fib6_ref); + fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); @@ -1283,7 +1276,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref); + fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; @@ -1326,7 +1319,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { - atomic_inc(&rt->fib6_ref); + fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } @@ -2297,6 +2290,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + unsigned int flags = rt->fib6_flags; const struct net_device *dev; seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); @@ -2306,15 +2300,17 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_flags & RTF_GATEWAY) - seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); - else + if (rt->fib6_nh.fib_nh_gw_family) { + flags |= RTF_GATEWAY; + seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + } else { seq_puts(seq, "00000000000000000000000000000000"); + } - dev = 
rt->fib6_nh.nh_dev; + dev = rt->fib6_nh.fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->fib6_metric, atomic_read(&rt->fib6_ref), 0, - rt->fib6_flags, dev ? dev->name : ""); + rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, + flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index cb54a8a3c273..be5f3d7ceb96 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -94,15 +94,21 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) return fl; } +static void fl_free_rcu(struct rcu_head *head) +{ + struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); + + if (fl->share == IPV6_FL_S_PROCESS) + put_pid(fl->owner.pid); + kfree(fl->opt); + kfree(fl); +} + static void fl_free(struct ip6_flowlabel *fl) { - if (fl) { - if (fl->share == IPV6_FL_S_PROCESS) - put_pid(fl->owner.pid); - kfree(fl->opt); - kfree_rcu(fl, rcu); - } + if (fl) + call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) @@ -633,9 +639,9 @@ recheck: if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && - (fl1->owner.pid == fl->owner.pid)) || + (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && - uid_eq(fl1->owner.uid, fl->owner.uid))) + !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b32c95f02128..655e46b227f9 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -525,10 +525,10 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) } static int ip6erspan_rcv(struct sk_buff *skb, - struct tnl_ptk_info *tpi) + struct tnl_ptk_info *tpi, + int gre_hdr_len) { struct erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; const struct ipv6hdr *ipv6h; struct erspan_md2 *md2; struct ip6_tnl *tunnel; @@ -547,18 +547,16 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; - ershdr = (struct erspan_base_hdr *)skb->data; - pkt_md = (struct erspan_metadata *)(ershdr + 1); - if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) return PACKET_REJECT; if (tunnel->parms.collect_md) { + struct erspan_metadata *pkt_md, *md; struct metadata_dst *tun_dst; struct ip_tunnel_info *info; - struct erspan_metadata *md; + unsigned char *gh; __be64 tun_id; __be16 flags; @@ -571,6 +569,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (!tun_dst) return PACKET_REJECT; + /* skb can be uncloned in __iptunnel_pull_header, so + * old pkt_md is no longer valid and we need to reset + * it + */ + gh = skb_network_header(skb) + + skb_network_header_len(skb); + pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + + sizeof(*ershdr)); info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); md->version = ver; @@ -607,7 +613,7 @@ static int gre_rcv(struct sk_buff *skb) if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { - if (ip6erspan_rcv(skb, &tpi) == PACKET_RCVD) + if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index c7ed2b6d5a1d..b50b1af1f530 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -29,6 +29,7 @@ #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/slab.h> +#include <linux/indirect_call_wrapper.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -47,6 
+48,8 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> +INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *)); static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -57,7 +60,8 @@ static void ip6_rcv_finish_core(struct net *net, struct sock *sk, ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) - edemux(skb); + INDIRECT_CALL_2(edemux, tcp_v6_early_demux, + udp_v6_early_demux, skb); } if (!skb_valid_dst(skb)) ip6_route_input(skb); @@ -316,6 +320,9 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, ip6_sublist_rcv(&sublist, curr_dev, curr_net); } +INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); + /* * Deliver the packet to the host */ @@ -391,7 +398,8 @@ resubmit_final: !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - ret = ipprot->handler(skb); + ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv, + skb); if (ret > 0) { if (ipprot->flags & INET6_PROTO_FINAL) { /* Not an extension header, most likely UDP diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index edbd12067170..adef2236abe2 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -117,7 +117,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); - ret = neigh_output(neigh, skb); + ret = neigh_output(neigh, skb, false); rcu_read_unlock_bh(); return ret; } @@ -601,7 +601,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; - unsigned int mtu, hlen, left, len; + unsigned int mtu, hlen, left, len, nexthdr_offset; int hroom, troom; __be32 frag_id; int ptr, offset = 0, err = 0; @@ -612,6 +612,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, goto fail; hlen = err; nexthdr = *prevhdr; + nexthdr_offset = prevhdr - skb_network_header(skb); mtu = ip6_skb_dst_mtu(skb); @@ -646,6 +647,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, (err = skb_checksum_help(skb))) goto fail; + prevhdr = skb_network_header(skb) + nexthdr_offset; hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 0c6403cf8b52..ade1390c6348 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -627,7 +627,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); - if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL) { + if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; @@ -636,7 +636,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || - skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) + skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 8b6eefff2f7e..218a0dedc8f4 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -342,7 +342,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, 
int err) struct net_device *dev; struct pcpu_sw_netstats *tstats; struct xfrm_state *x; - struct xfrm_mode *inner_mode; + const struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; @@ -361,7 +361,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) x = xfrm_input_state(skb); - inner_mode = x->inner_mode; + inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); @@ -372,7 +372,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) } } - family = inner_mode->afinfo->family; + family = inner_mode->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e4dd57976737..4e69847ed5be 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -355,7 +355,6 @@ static const struct rhashtable_params ip6mr_rht_params = { .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, - .locks_mul = 1, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 659ecf4e4b3c..4c8e2ea8bf19 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -77,6 +77,8 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); +static bool ndisc_allow_add(const struct net_device *dev, + struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -117,6 +119,7 @@ struct neigh_table nd_tbl = { .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, + .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, @@ -392,6 +395,20 @@ static void pndisc_destructor(struct pneigh_entry *n) ipv6_dev_mc_dec(dev, &maddr); } +/* called with rtnl held */ +static bool ndisc_allow_add(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (!idev || idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); + return false; + } + + return true; +} + static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { @@ -1276,8 +1293,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, - rt->fib6_nh.nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, + rt->fib6_nh.fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1306,8 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, - rt->fib6_nh.nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, + rt->fib6_nh.fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ddc99a1653aa..086fc669279e 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -23,14 +23,6 @@ config NF_TABLES_IPV6 if NF_TABLES_IPV6 -config NFT_CHAIN_ROUTE_IPV6 - tristate "IPv6 nf_tables route chain support" - help - This option enables the "route" chain for IPv6 in 
nf_tables. This - chain type is used to force packet re-routing after mangling header - fields such as the source, destination, flowlabel, hop-limit and - the packet mark. - config NFT_REJECT_IPV6 select NF_REJECT_IPV6 default NFT_REJECT @@ -278,15 +270,10 @@ if IP6_NF_NAT config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" - select NF_NAT_MASQUERADE + select NETFILTER_XT_TARGET_MASQUERADE help - Masquerading is a special case of NAT: all outgoing connections are - changed to seem to come from a particular interface's address, and - if the interface goes down, those connections are lost. This is - only useful for dialup accounts with dynamic IP address (ie. your IP - address will be different on next dialup). - - To compile it as a module, choose M here. If unsure, say N. + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP6_NF_TARGET_NPT tristate "NPT (Network Prefix translation) target support" diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 3853c648ebaa..731a74c60dca 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,7 +27,6 @@ obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o # nf_tables -obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o @@ -47,7 +46,6 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o # targets -obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c deleted file mode 100644 index 29c7f1915a96..000000000000 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6 - * NAT funded by Astaro. 
- */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/ipv6.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat.h> -#include <net/addrconf.h> -#include <net/ipv6.h> -#include <net/netfilter/ipv6/nf_nat_masquerade.h> - -static unsigned int -masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) -{ - return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par)); -} - -static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) -{ - const struct nf_nat_range2 *range = par->targinfo; - - if (range->flags & NF_NAT_RANGE_MAP_IPS) - return -EINVAL; - return nf_ct_netns_get(par->net, par->family); -} - -static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par) -{ - nf_ct_netns_put(par->net, par->family); -} - -static struct xt_target masquerade_tg6_reg __read_mostly = { - .name = "MASQUERADE", - .family = NFPROTO_IPV6, - .checkentry = masquerade_tg6_checkentry, - .destroy = masquerade_tg6_destroy, - .target = masquerade_tg6, - .targetsize = sizeof(struct nf_nat_range), - .table = "nat", - .hooks = 1 << NF_INET_POST_ROUTING, - .me = THIS_MODULE, -}; - -static int __init masquerade_tg6_init(void) -{ - int err; - - err = xt_register_target(&masquerade_tg6_reg); - if (err) - return err; - - err = nf_nat_masquerade_ipv6_register_notifier(); - if (err) - xt_unregister_target(&masquerade_tg6_reg); - - return err; -} -static void __exit masquerade_tg6_exit(void) -{ - nf_nat_masquerade_ipv6_unregister_notifier(); - xt_unregister_target(&masquerade_tg6_reg); -} - -module_init(masquerade_tg6_init); -module_exit(masquerade_tg6_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: automatic address SNAT"); diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c deleted file mode 100644 index da3f1f8cb325..000000000000 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> - * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_ipv6.h> -#include <net/route.h> - -static unsigned int nf_route_table_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - unsigned int ret; - struct nft_pktinfo pkt; - struct in6_addr saddr, daddr; - u_int8_t hop_limit; - u32 mark, flowlabel; - int err; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); - - /* save source/dest address, mark, hoplimit, flowlabel, priority */ - memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); - memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); - mark = skb->mark; - hop_limit = ipv6_hdr(skb)->hop_limit; - - /* flowlabel and prio (includes version, which shouldn't change either */ - flowlabel = *((u32 *)ipv6_hdr(skb)); - - ret = nft_do_chain(&pkt, priv); - if (ret != NF_DROP && ret != NF_STOLEN && - (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || - memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || - skb->mark != mark || - ipv6_hdr(skb)->hop_limit != hop_limit || - flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { - err = ip6_route_me_harder(state->net, skb); - if (err < 0) - ret = NF_DROP_ERR(err); - } - - return ret; -} - -static const struct nft_chain_type nft_chain_route_ipv6 = { - .name = "route", - .type = NFT_CHAIN_T_ROUTE, - .family = NFPROTO_IPV6, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_OUT), - .hooks = { - [NF_INET_LOCAL_OUT] = nf_route_table_hook, - }, -}; - -static int __init nft_chain_route_init(void) -{ - nft_register_chain_type(&nft_chain_route_ipv6); - - return 0; -} - -static void __exit nft_chain_route_exit(void) -{ - nft_unregister_chain_type(&nft_chain_route_ipv6); -} - -module_init(nft_chain_route_init); -module_exit(nft_chain_route_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route"); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 4fe7c90962dd..868ae23dbae1 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -10,15 +10,25 @@ #include <net/secure_seq.h> #include <linux/netfilter.h> -static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, +static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { + const struct { + struct in6_addr dst; + struct in6_addr src; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .dst = *dst, + .src = *src, + }; u32 hash, id; - hash = __ipv6_addr_jhash(dst, hashrnd); - hash = __ipv6_addr_jhash(src, hash); - hash ^= net_hash_mix(net); + /* Note the following code is not safe, but this is okay. 
*/ + if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) + get_random_bytes(&net->ipv4.ip_id_key, + sizeof(net->ipv4.ip_id_key)); + + hash = siphash(&combined, sizeof(combined), &net->ipv4.ip_id_key); /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, * set the hight order instead thus minimizing possible future @@ -41,7 +51,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, */ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) { - static u32 ip6_proxy_idents_hashrnd __read_mostly; struct in6_addr buf[2]; struct in6_addr *addrs; u32 id; @@ -53,11 +62,7 @@ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) if (!addrs) return 0; - net_get_random_once(&ip6_proxy_idents_hashrnd, - sizeof(ip6_proxy_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, - &addrs[1], &addrs[0]); + id = __ipv6_select_ident(net, &addrs[1], &addrs[0]); return htonl(id); } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); @@ -66,12 +71,9 @@ __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) { - static u32 ip6_idents_hashrnd __read_mostly; u32 id; - net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - - id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); + id = __ipv6_select_ident(net, daddr, saddr); return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5a426226c762..84dbe21b71e5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1356,6 +1356,7 @@ const struct proto_ops inet6_sockraw_ops = { .getname = inet6_getname, .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ + .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0302e0eb07af..23a20d62daac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -59,7 +59,7 @@ #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> @@ -102,14 +102,15 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict); static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, +static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, struct in6_addr *daddr, struct in6_addr *saddr); @@ -295,7 +296,7 @@ static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, - .fib6_ref = ATOMIC_INIT(1), + .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; @@ -379,11 +380,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - rcu_read_lock(); - from = rcu_dereference(rt->from); - 
rcu_assign_pointer(rt->from, NULL); + from = xchg((__force struct fib6_info **)&rt->from, NULL); fib6_info_release(from); - rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -427,13 +425,15 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -struct fib6_info *fib6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict) +void fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict) { struct fib6_info *sibling, *next_sibling; + struct fib6_info *match = res->f6i; + + if (!match->fib6_nsiblings || have_oif_match) + goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -441,61 +441,89 @@ struct fib6_info *fib6_multipath_select(const struct net *net, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) - return match; + if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { + const struct fib6_nh *nh = &sibling->fib6_nh; int nh_upper_bound; - nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); + nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (fl6->mp_hash > nh_upper_bound) continue; - if (rt6_score_route(sibling, oif, strict) < 0) + if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } - return match; +out: + res->f6i = match; + res->nh = &match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. 
*/ -static inline struct fib6_info *rt6_device_match(struct net *net, - struct fib6_info *rt, - const struct in6_addr *saddr, - int oif, - int flags) +static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, + const struct in6_addr *saddr, int oif, int flags) { - struct fib6_info *sprt; + const struct net_device *dev; - if (!oif && ipv6_addr_any(saddr) && - !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) - return rt; + if (nh->fib_nh_flags & RTNH_F_DEAD) + return false; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { - const struct net_device *dev = sprt->fib6_nh.nh_dev; + dev = nh->fib_nh_dev; + if (oif) { + if (dev->ifindex == oif) + return true; + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return true; + } - if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) - continue; + return false; +} - if (oif) { - if (dev->ifindex == oif) - return sprt; - } else { - if (ipv6_chk_addr(net, saddr, dev, - flags & RT6_LOOKUP_F_IFACE)) - return sprt; +static void rt6_device_match(struct net *net, struct fib6_result *res, + const struct in6_addr *saddr, int oif, int flags) +{ + struct fib6_info *f6i = res->f6i; + struct fib6_info *spf6i; + struct fib6_nh *nh; + + if (!oif && ipv6_addr_any(saddr)) { + nh = &f6i->fib6_nh; + if (!(nh->fib_nh_flags & RTNH_F_DEAD)) + goto out; + } + + for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { + nh = &spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) { + res->f6i = spf6i; + goto out; } } - if (oif && flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.fib6_null_entry; + if (oif && flags & RT6_LOOKUP_F_IFACE) { + res->f6i = net->ipv6.fib6_null_entry; + nh = &res->f6i->fib6_nh; + goto out; + } - return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; + nh = &f6i->fib6_nh; + if (nh->fib_nh_flags & RTNH_F_DEAD) { + res->f6i = net->ipv6.fib6_null_entry; + nh = &res->f6i->fib6_nh; + } +out: + res->nh = nh; + res->fib6_type = res->f6i->fib6_type; + res->fib6_flags = res->f6i->fib6_flags; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -517,7 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct fib6_info *rt) +static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; @@ -533,11 +561,11 @@ static void rt6_probe(struct fib6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. 
*/ - if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) + if (!fib6_nh->fib_nh_gw_family) return; - nh_gw = &rt->fib6_nh.nh_gw; - dev = rt->fib6_nh.nh_dev; + nh_gw = &fib6_nh->fib_nh_gw6; + dev = fib6_nh->fib_nh_dev; rcu_read_lock_bh(); idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); @@ -554,13 +582,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else if (time_after(jiffies, rt->last_probe + + } else if (time_after(jiffies, fib6_nh->last_probe + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { - rt->last_probe = jiffies; + fib6_nh->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -572,7 +600,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct fib6_info *rt) +static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif @@ -580,27 +608,14 @@ static inline void rt6_probe(struct fib6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct fib6_info *rt, int oif) -{ - const struct net_device *dev = rt->fib6_nh.nh_dev; - - if (!oif || dev->ifindex == oif) - return 2; - return 0; -} - -static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) +static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - if (rt->fib6_flags & RTF_NONEXTHOP || - !(rt->fib6_flags & RTF_GATEWAY)) - return RT6_NUD_SUCCEED; - rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, - &rt->fib6_nh.nh_gw); + neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, + &fib6_nh->fib_nh_gw6); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -621,58 +636,44 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) return ret; } -static int rt6_score_route(struct fib6_info *rt, int oif, int strict) +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict) { - int m; + int m = 0; + + if (!oif || nh->fib_nh_dev->ifindex == oif) + m = 2; - m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif - if (strict & RT6_LOOKUP_F_REACHABLE) { - int n = rt6_check_neigh(rt); + if ((strict & RT6_LOOKUP_F_REACHABLE) && + !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { + int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } -/* called with rc_read_lock held */ -static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) +static bool find_match(struct fib6_nh *nh, u32 fib6_flags, + int oif, int strict, int *mpri, bool *do_rr) { - const struct net_device *dev = fib6_info_nh_dev(f6i); + bool match_do_rr = false; bool rc = false; - - if (dev) { - const struct inet6_dev *idev = __in6_dev_get(dev); - - rc = !!idev->cnf.ignore_routes_with_linkdown; - } - - return rc; -} - -static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, - int *mpri, struct fib6_info *match, - bool *do_rr) -{ int m; - bool match_do_rr = false; - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; - if (fib6_ignore_linkdown(rt) && - rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && + if (ip6_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags &
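
rt6_score_route() now takes the nexthop and the route flags separately, but the scoring scheme itself is unchanged: an interface match is worth 2, and the decoded RFC 4191 router preference occupies the bits above it, so preference dominates once any device constraint is satisfied. A toy restatement, leaving out the neighbour-reachability step and passing the decoded preference in directly (1 low, 2 medium, 3 high, which is my reading of what IPV6_DECODE_PREF() produces):

	#include <stdbool.h>

	#define SCORE_FAIL_HARD (-3)	/* stand-in for RT6_NUD_FAIL_HARD */

	static int score_route(int oif, int nh_ifindex, bool iface_required,
			       int pref)
	{
		int m = 0;

		if (!oif || nh_ifindex == oif)
			m = 2;			/* device matches, or no oif given */

		if (!m && iface_required)
			return SCORE_FAIL_HARD;	/* RT6_LOOKUP_F_IFACE behaviour */

		m |= pref << 2;			/* preference outranks the device bit */
		return m;
	}
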
RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; - if (fib6_check_expired(rt)) - goto out; - - m = rt6_score_route(rt, oif, strict); + m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ @@ -681,67 +682,82 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, } if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(rt); + rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; - match = rt; + rc = true; } out: - return match; + return rc; } -static struct fib6_info *find_rr_leaf(struct fib6_node *fn, - struct fib6_info *leaf, - struct fib6_info *rr_head, - u32 metric, int oif, int strict, - bool *do_rr) +static void __find_rr_leaf(struct fib6_info *f6i_start, + struct fib6_info *nomatch, u32 metric, + struct fib6_result *res, struct fib6_info **cont, + int oif, int strict, bool *do_rr, int *mpri) { - struct fib6_info *rt, *match, *cont; - int mpri = -1; + struct fib6_info *f6i; - match = NULL; - cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + for (f6i = f6i_start; + f6i && f6i != nomatch; + f6i = rcu_dereference(f6i->fib6_next)) { + struct fib6_nh *nh; + + if (cont && f6i->fib6_metric != metric) { + *cont = f6i; + return; } - match = find_match(rt, oif, strict, &mpri, match, do_rr); - } + if (fib6_check_expired(f6i)) + continue; - for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + nh = &f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + res->f6i = f6i; + res->nh = nh; + res->fib6_flags = f6i->fib6_flags; + res->fib6_type = f6i->fib6_type; } - - match = find_match(rt, oif, strict, &mpri, match, do_rr); } +} + +static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, + struct fib6_info *rr_head, int oif, int strict, + bool *do_rr, struct fib6_result *res) +{ + u32 metric = rr_head->fib6_metric; + struct fib6_info *cont = NULL; + int mpri = -1; - if (match || !cont) - return match; + __find_rr_leaf(rr_head, NULL, metric, res, &cont, + oif, strict, do_rr, &mpri); - for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) - match = find_match(rt, oif, strict, &mpri, match, do_rr); + __find_rr_leaf(leaf, rr_head, metric, res, &cont, + oif, strict, do_rr, &mpri); - return match; + if (res->f6i || !cont) + return; + + __find_rr_leaf(cont, NULL, metric, res, NULL, + oif, strict, do_rr, &mpri); } -static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, - int oif, int strict) +static void rt6_select(struct net *net, struct fib6_node *fn, int oif, + struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); - struct fib6_info *match, *rt0; + struct fib6_info *rt0; bool do_rr = false; int key_plen; + /* make sure this function or its helpers sets f6i */ + res->f6i = NULL; + if (!leaf || leaf == net->ipv6.fib6_null_entry) - return net->ipv6.fib6_null_entry; + goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -758,11 +774,9 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.fib6_null_entry; - - match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, - &do_rr); + goto out; + find_rr_leaf(fn, leaf, rt0, oif, strict, 
&do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); @@ -779,12 +793,19 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, } } - return match ? match : net->ipv6.fib6_null_entry; +out: + if (!res->f6i) { + res->f6i = net->ipv6.fib6_null_entry; + res->nh = &res->f6i->fib6_nh; + res->fib6_flags = res->f6i->fib6_flags; + res->fib6_type = res->f6i->fib6_type; + } } -static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { - return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); + return (res->f6i->fib6_flags & RTF_NONEXTHOP) || + res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -868,17 +889,17 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, */ /* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) +static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { - struct net_device *dev = rt->fib6_nh.nh_dev; + struct net_device *dev = res->nh->fib_nh_dev; - if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { + if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->fib6_dst.addr)) + !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; @@ -924,11 +945,11 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt) return flags; } -static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) +static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { - rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); + rt->dst.error = ip6_rt_type_to_error(fib6_type); - switch (ort->fib6_type) { + switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; @@ -946,26 +967,28 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) } } -static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) +static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { - if (ort->fib6_flags & RTF_REJECT) { - ip6_rt_init_dst_reject(rt, ort); + struct fib6_info *f6i = res->f6i; + + if (res->fib6_flags & RTF_REJECT) { + ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; - if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { + if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; - } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { + } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; } else { rt->dst.input = ip6_forward; } - if (ort->fib6_nh.nh_lwtstate) { - rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); + if (res->nh->fib_nh_lws) { + rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } @@ -980,20 +1003,25 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } -/* Caller must already hold reference to @ort */ -static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) +/* Caller must already hold reference to f6i in result */ +static void ip6_rt_copy_init(struct 
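
ip6_rt_init_dst_reject() is now driven by the fib6_type carried in the result instead of re-reading the fib6_info. The underlying type-to-errno mapping, ip6_rt_type_to_error(), is not itself changed or shown by this diff; the sketch below restates that mapping as I read it from the surrounding kernel of this era, with the RTN_* values copied from the rtnetlink uapi header, so treat the exact errno choices as an assumption rather than something the hunk proves.

	#include <errno.h>
	#include <stdint.h>

	enum { RTN_BLACKHOLE = 6, RTN_UNREACHABLE = 7,
	       RTN_PROHIBIT = 8, RTN_THROW = 9 };

	static int rt_type_to_error(uint8_t fib6_type)
	{
		switch (fib6_type) {
		case RTN_BLACKHOLE:	return -EINVAL;		/* silently drop */
		case RTN_PROHIBIT:	return -EACCES;		/* admin prohibited */
		case RTN_THROW:		return -EAGAIN;		/* try next rule/table */
		default:		return -ENETUNREACH;	/* incl. RTN_UNREACHABLE */
		}
	}
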
rt6_info *rt, const struct fib6_result *res) { - struct net_device *dev = fib6_info_nh_dev(ort); + const struct fib6_nh *nh = res->nh; + const struct net_device *dev = nh->fib_nh_dev; + struct fib6_info *f6i = res->f6i; - ip6_rt_init_dst(rt, ort); + ip6_rt_init_dst(rt, res); - rt->rt6i_dst = ort->fib6_dst; + rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; - rt->rt6i_gateway = ort->fib6_nh.nh_gw; - rt->rt6i_flags = ort->fib6_flags; - rt6_set_from(rt, ort); + rt->rt6i_flags = res->fib6_flags; + if (nh->fib_nh_gw_family) { + rt->rt6i_gateway = nh->fib_nh_gw6; + rt->rt6i_flags |= RTF_GATEWAY; + } + rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->fib6_src; + rt->rt6i_src = f6i->fib6_src; #endif } @@ -1015,14 +1043,13 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } -static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, - bool null_fallback) +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) { struct rt6_info *rt = *prt; if (dst_hold_safe(&rt->dst)) return true; - if (null_fallback) { + if (net) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } else { @@ -1033,22 +1060,24 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, } /* called with rcu_lock held */ -static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) +static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { - unsigned short flags = fib6_info_dst_flags(rt); - struct net_device *dev = rt->fib6_nh.nh_dev; + struct net_device *dev = res->nh->fib_nh_dev; + struct fib6_info *f6i = res->f6i; + unsigned short flags; struct rt6_info *nrt; - if (!fib6_info_hold_safe(rt)) + if (!fib6_info_hold_safe(f6i)) goto fallback; + flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { - fib6_info_release(rt); + fib6_info_release(f6i); goto fallback; } - ip6_rt_copy_init(nrt, rt); + ip6_rt_copy_init(nrt, res); return nrt; fallback: @@ -1063,7 +1092,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct fib6_info *f6i; + struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; @@ -1073,37 +1102,38 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - f6i = rcu_dereference(fn->leaf); - if (!f6i) { - f6i = net->ipv6.fib6_null_entry; - } else { - f6i = rt6_device_match(net, f6i, &fl6->saddr, - fl6->flowi6_oif, flags); - if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) - f6i = fib6_multipath_select(net, f6i, fl6, - fl6->flowi6_oif, skb, - flags); - } - if (f6i == net->ipv6.fib6_null_entry) { + res.f6i = rcu_dereference(fn->leaf); + if (!res.f6i) + res.f6i = net->ipv6.fib6_null_entry; + else + rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, + flags); + + if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; + + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + goto out; } - trace_fib6_table_lookup(net, f6i, table, fl6); + fib6_select_path(net, &res, fl6, fl6->flowi6_oif, + fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ - rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt, true)) + if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); - } else if (f6i == net->ipv6.fib6_null_entry) { - rt = 
net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); } else { - rt = ip6_create_rt_rcu(f6i); + rt = ip6_create_rt_rcu(&res); } +out: + trace_fib6_table_lookup(net, &res, table, fl6); + rcu_read_unlock(); return rt; @@ -1169,10 +1199,11 @@ int ip6_ins_rt(struct net *net, struct fib6_info *rt) return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; @@ -1180,25 +1211,25 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, * Clone the route. */ - if (!fib6_info_hold_safe(ort)) + if (!fib6_info_hold_safe(f6i)) return NULL; - dev = ip6_rt_get_dev_rcu(ort); + dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { - fib6_info_release(ort); + fib6_info_release(f6i); return NULL; } - ip6_rt_copy_init(rt, ort); + ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->dst.flags |= DST_HOST; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; - if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->fib6_dst.plen != 128 && - ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) + if (!rt6_is_gw_or_nonexthop(res)) { + if (f6i->fib6_dst.plen != 128 && + ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -1211,55 +1242,56 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { - unsigned short flags = fib6_info_dst_flags(rt); + struct fib6_info *f6i = res->f6i; + unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; - if (!fib6_info_hold_safe(rt)) + if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); - dev = ip6_rt_get_dev_rcu(rt); + dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) { - fib6_info_release(rt); + fib6_info_release(f6i); return NULL; } - ip6_rt_copy_init(pcpu_rt, rt); + ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt, **p; - p = this_cpu_ptr(rt->rt6i_pcpu); + p = this_cpu_ptr(res->f6i->rt6i_pcpu); pcpu_rt = *p; if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt, false); + ip6_hold_safe(NULL, &pcpu_rt); return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct net *net, - struct fib6_info *rt) + const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; - pcpu_rt = ip6_rt_pcpu_alloc(rt); + pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) { dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(rt->rt6i_pcpu); + p = this_cpu_ptr(res->f6i->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1288,9 +1320,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = rcu_dereference_protected(rt6_ex->rt6i->from, - 
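
rt6_make_pcpu_route() publishes the freshly built clone into the per-CPU slot with cmpxchg() and BUG()s on a non-NULL previous value: the caller runs under local_bh_disable(), so nothing else can be installing into the same slot. The publish step modeled in portable C11, with atomic_compare_exchange_strong() standing in for the kernel's cmpxchg() and a fixed array standing in for this_cpu_ptr():

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct rt { int dummy; };

	/* one cached clone per CPU */
	static _Atomic(struct rt *) pcpu_slot[64];

	static struct rt *publish_pcpu(unsigned int cpu, struct rt *fresh)
	{
		struct rt *expected = NULL;
		bool installed;

		/* install only if the slot is still empty */
		installed = atomic_compare_exchange_strong(&pcpu_slot[cpu],
							   &expected, fresh);
		assert(installed);	/* mirrors the kernel's BUG_ON(prev) */
		return fresh;
	}

Using compare-exchange rather than a plain store keeps the invariant checkable: a non-NULL previous value would mean two writers raced the same slot despite bottom halves being disabled.
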
lockdep_is_held(&rt6_exception_lock)); - rcu_assign_pointer(rt6_ex->rt6i->from, NULL); + from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); @@ -1402,14 +1432,15 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } -static unsigned int fib6_mtu(const struct fib6_info *rt) +static unsigned int fib6_mtu(const struct fib6_result *res) { + const struct fib6_nh *nh = res->nh; unsigned int mtu; - if (rt->fib6_pmtu) { - mtu = rt->fib6_pmtu; + if (res->f6i->fib6_pmtu) { + mtu = res->f6i->fib6_pmtu; } else { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); @@ -1420,26 +1451,27 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); - return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } static int rt6_insert_exception(struct rt6_info *nrt, - struct fib6_info *ort) + const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; + struct fib6_info *f6i = res->f6i; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (ort->exception_bucket_flushed) { + if (f6i->exception_bucket_flushed) { err = -EINVAL; goto out; } - bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), @@ -1448,24 +1480,24 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); } #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates ort is in subtree + /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by - * a hash of only rt6i_dst. + * a hash of only fib6_dst. */ - if (ort->fib6_src.plen) + if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif - /* rt6_mtu_change() might lower mtu on ort. + /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu - * is less than ort's mtu value. + * is less than f6i's mtu value. 
*/ - if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } @@ -1494,9 +1526,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { - spin_lock_bh(&ort->fib6_table->tb6_lock); - fib6_update_sernum(net, ort); - spin_unlock_bh(&ort->fib6_table->tb6_lock); + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1533,33 +1565,33 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, +static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, struct in6_addr *daddr, struct in6_addr *saddr) { struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct rt6_info *res = NULL; + struct rt6_info *ret = NULL; - bucket = rcu_dereference(rt->rt6i_exception_bucket); + bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates rt is in subtree + /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by - * a hash of only rt6i_dst. + * a hash of only fib6_dst. */ - if (rt->fib6_src.plen) + if (res->f6i->fib6_src.plen) src_key = saddr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) - res = rt6_ex->rt6i; + ret = rt6_ex->rt6i; - return res; + return ret; } /* Remove the passed in cached rt from the hash table that contains it */ @@ -1807,11 +1839,10 @@ void rt6_age_exceptions(struct fib6_info *rt, } /* must be called with rcu lock held */ -struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int strict) +int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, + struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; - struct fib6_info *f6i; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1820,8 +1851,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - f6i = rt6_select(net, fn, oif, strict); - if (f6i == net->ipv6.fib6_null_entry) { + rt6_select(net, fn, oif, res, strict); + if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1833,16 +1864,16 @@ redo_rt6_select: } } - trace_fib6_table_lookup(net, f6i, table, fl6); + trace_fib6_table_lookup(net, res, table, fl6); - return f6i; + return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { - struct fib6_info *f6i; + struct fib6_result res = {}; struct rt6_info *rt; int strict = 0; @@ -1853,27 +1884,26 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); - f6i = fib6_table_lookup(net, table, oif, fl6, strict); - if (f6i->fib6_nsiblings) - f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); - - if (f6i == net->ipv6.fib6_null_entry) { + fib6_table_lookup(net, table, oif, fl6, &res, strict); + if (res.f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock();
dst_hold(&rt->dst); return rt; } + fib6_select_path(net, &res, fl6, oif, false, skb, strict); + /*Search through exception table */ - rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt, true)) + if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); rcu_read_unlock(); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(f6i->fib6_flags & RTF_GATEWAY))) { + !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -1881,7 +1911,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, */ struct rt6_info *uncached_rt; - uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); + uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); rcu_read_unlock(); @@ -1903,10 +1933,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, struct rt6_info *pcpu_rt; local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(f6i); + pcpu_rt = rt6_get_pcpu_route(&res); if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, f6i); + pcpu_rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); rcu_read_unlock(); @@ -2325,15 +2355,23 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { - struct fib6_info *from; + struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); - from = rcu_dereference(rt6->from); - nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); + res.f6i = rcu_dereference(rt6->from); + if (!res.f6i) { + rcu_read_unlock(); + return; + } + res.nh = &res.f6i->fib6_nh; + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, from)) + if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } rcu_read_unlock(); @@ -2406,6 +2444,36 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, NULL); } +static bool ip6_redirect_nh_match(const struct fib6_result *res, + struct flowi6 *fl6, + const struct in6_addr *gw, + struct rt6_info **ret) +{ + const struct fib6_nh *nh = res->nh; + + if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || + fl6->flowi6_oif != nh->fib_nh_dev->ifindex) + return false; + + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. 
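
Paths that begin from an rt6_info, like the PMTU update above and the redirect handling below, now rebuild a fib6_result from rt->from by hand, copying the nexthop pointer, flags, and type each time. The patch open-codes those four stores at every such call site; a helper along the lines of the hypothetical fib6_result_from_f6i() below (all names invented, struct bodies reduced to the touched fields) captures the repeated pattern:

	#include <stdint.h>

	struct fib6_nh { int ifindex; };
	struct fib6_info {
		struct fib6_nh fib6_nh;
		uint32_t fib6_flags;
		uint8_t  fib6_type;
	};
	struct fib6_result {
		struct fib6_nh   *nh;
		struct fib6_info *f6i;
		uint32_t          fib6_flags;
		uint8_t           fib6_type;
	};

	/* hypothetical helper; this patch open-codes the same stores at
	 * each call site that starts from rt->from */
	static inline void fib6_result_from_f6i(struct fib6_result *res,
						struct fib6_info *f6i)
	{
		res->f6i = f6i;
		res->nh = &f6i->fib6_nh;	/* one embedded nexthop at this point */
		res->fib6_flags = f6i->fib6_flags;
		res->fib6_type = f6i->fib6_type;
	}
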
+ */ + if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { + *ret = rt_cache; + return true; + } + return false; + } + return true; +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2419,7 +2487,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *ret = NULL, *rt_cache; + struct rt6_info *ret = NULL; + struct fib6_result res = {}; struct fib6_info *rt; struct fib6_node *fn; @@ -2437,34 +2506,15 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) - continue; + res.f6i = rt; + res.nh = &rt->fib6_nh; + if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (!(rt->fib6_flags & RTF_GATEWAY)) - continue; - if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) - continue; - /* rt_cache's gateway might be different from its 'parent' - * in the case of an ip redirect. - * So we keep searching in the exception table if the gateway - * is different. - */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { - rt_cache = rt6_find_cached_rt(rt, - &fl6->daddr, - &fl6->saddr); - if (rt_cache && - ipv6_addr_equal(&rdfl->gateway, - &rt_cache->rt6i_gateway)) { - ret = rt_cache; - break; - } - continue; - } - break; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) + goto out; } if (!rt) @@ -2480,15 +2530,20 @@ restart: goto restart; } + res.f6i = rt; + res.nh = &rt->fib6_nh; out: - if (ret) - ip6_hold_safe(net, &ret, true); - else - ret = ip6_create_rt_rcu(rt); + if (ret) { + ip6_hold_safe(net, &ret); + } else { + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + ret = ip6_create_rt_rcu(&res); + } rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); + trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; @@ -2606,12 +2661,15 @@ out: * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ -u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr) +u32 ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_exception_bucket *bucket; + const struct fib6_nh *nh = res->nh; + struct fib6_info *f6i = res->f6i; + const struct in6_addr *src_key; struct rt6_exception *rt6_ex; - struct in6_addr *src_key; struct inet6_dev *idev; u32 mtu = 0; @@ -2633,7 +2691,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); if (likely(!mtu)) { - struct net_device *dev = fib6_info_nh_dev(f6i); + struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); @@ -2643,7 +2701,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: - return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, @@ -2899,17 +2957,143 @@ out: return err; } +static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) +{ + if ((flags & RTF_REJECT) || + (dev && (dev->flags & 
IFF_LOOPBACK) && + !(addr_type & IPV6_ADDR_LOOPBACK) && + !(flags & RTF_LOCAL))) + return true; + + return false; +} + +int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + int addr_type; + int err; + + fib6_nh->fib_nh_family = AF_INET6; + + err = -ENODEV; + if (cfg->fc_ifindex) { + dev = dev_get_by_index(net, cfg->fc_ifindex); + if (!dev) + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; + } + + if (cfg->fc_flags & RTNH_F_ONLINK) { + if (!dev) { + NL_SET_ERR_MSG(extack, + "Nexthop device required for onlink"); + goto out; + } + + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + + fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; + } + + fib6_nh->fib_nh_weight = 1; + + /* We cannot add true routes via loopback here, + * they would result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { + /* hold loopback dev/idev if we haven't done so. */ + if (dev != net->loopback_dev) { + if (dev) { + dev_put(dev); + in6_dev_put(idev); + } + dev = net->loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } + goto set_dev; + } + + if (cfg->fc_flags & RTF_GATEWAY) { + err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + if (err) + goto out; + + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + } + + err = -ENODEV; + if (!dev) + goto out; + + if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); + err = -EACCES; + goto out; + } + + if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + + if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && + !netif_carrier_ok(dev)) + fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + + err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, + cfg->fc_encap_type, cfg, gfp_flags, extack); + if (err) + goto out; +set_dev: + fib6_nh->fib_nh_dev = dev; + fib6_nh->fib_nh_oif = dev->ifindex; + err = 0; +out: + if (idev) + in6_dev_put(idev); + + if (err) { + lwtstate_put(fib6_nh->fib_nh_lws); + fib6_nh->fib_nh_lws = NULL; + if (dev) + dev_put(dev); + } + + return err; +} + +void fib6_nh_release(struct fib6_nh *fib6_nh) +{ + fib_nh_common_release(&fib6_nh->nh_common); +} + static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; - struct net_device *dev = NULL; - struct inet6_dev *idev = NULL; struct fib6_table *table; - int addr_type; int err = -EINVAL; + int addr_type; /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { @@ -2943,33 +3127,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif - if (cfg->fc_ifindex) { - err = -ENODEV; - dev = dev_get_by_index(net, cfg->fc_ifindex); - if (!dev) - goto out; - idev = in6_dev_get(dev); - if (!idev) - goto out; - } - - if (cfg->fc_metric == 0) - cfg->fc_metric = IP6_RT_PRIO_USER; - - if (cfg->fc_flags & RTNH_F_ONLINK) { - if (!dev) { - NL_SET_ERR_MSG(extack, - "Nexthop device required for onlink"); - err = -ENODEV; - goto out; - } - - if (!(dev->flags & IFF_UP)) { - 
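
fib6_nh_init() folds the old loopback special case from ip6_route_info_create() into fib6_is_reject(): a route is forced to be a reject route either explicitly, via RTF_REJECT, or implicitly, when it is bound to a loopback device for an address that is neither a loopback address nor marked RTF_LOCAL. The predicate restated as a self-contained check, with the relevant flag values copied out of the kernel headers (restated locally so the snippet builds on its own):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define RTF_REJECT		0x0200
	#define RTF_LOCAL		0x80000000u
	#define IFF_LOOPBACK		0x8
	#define IPV6_ADDR_LOOPBACK	0x0010u

	static bool is_reject(uint32_t flags, unsigned int dev_flags,
			      bool have_dev, unsigned int addr_type)
	{
		return (flags & RTF_REJECT) ||
		       (have_dev && (dev_flags & IFF_LOOPBACK) &&
			!(addr_type & IPV6_ADDR_LOOPBACK) &&
			!(flags & RTF_LOCAL));
	}

	int main(void)
	{
		/* a global address routed via lo is promoted to a reject route */
		assert(is_reject(0, IFF_LOOPBACK, true, 0));
		/* ::1 itself stays a regular local route */
		assert(!is_reject(RTF_LOCAL, IFF_LOOPBACK, true,
				  IPV6_ADDR_LOOPBACK));
		return 0;
	}
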
NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } - } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3013,18 +3170,10 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; - addr_type = ipv6_addr_type(&cfg->fc_dst); - - if (cfg->fc_encap) { - struct lwtunnel_state *lwtstate; - - err = lwtunnel_build_state(cfg->fc_encap_type, - cfg->fc_encap, AF_INET6, cfg, - &lwtstate, extack); - if (err) - goto out; - rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); - } + rt->fib6_table = table; + rt->fib6_metric = cfg->fc_metric; + rt->fib6_type = cfg->fc_type; + rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; @@ -3035,62 +3184,20 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - - rt->fib6_metric = cfg->fc_metric; - rt->fib6_nh.nh_weight = 1; - - rt->fib6_type = cfg->fc_type; + err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; /* We cannot add true routes via loopback here, - they would result in kernel looping; promote them to reject routes + * they would result in kernel looping; promote them to reject routes */ - if ((cfg->fc_flags & RTF_REJECT) || - (dev && (dev->flags & IFF_LOOPBACK) && - !(addr_type & IPV6_ADDR_LOOPBACK) && - !(cfg->fc_flags & RTF_LOCAL))) { - /* hold loopback dev/idev if we haven't done so. */ - if (dev != net->loopback_dev) { - if (dev) { - dev_put(dev); - in6_dev_put(idev); - } - dev = net->loopback_dev; - dev_hold(dev); - idev = in6_dev_get(dev); - if (!idev) { - err = -ENODEV; - goto out; - } - } - rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; - goto install_route; - } - - if (cfg->fc_flags & RTF_GATEWAY) { - err = ip6_validate_gw(net, cfg, &dev, &idev, extack); - if (err) - goto out; - - rt->fib6_nh.nh_gw = cfg->fc_gateway; - } - - err = -ENODEV; - if (!dev) - goto out; - - if (idev->cnf.disable_ipv6) { - NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); - err = -EACCES; - goto out; - } - - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; if (!ipv6_addr_any(&cfg->fc_prefsrc)) { + struct net_device *dev = fib6_info_nh_dev(rt); + if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; @@ -3101,26 +3208,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else rt->fib6_prefsrc.plen = 0; - rt->fib6_flags = cfg->fc_flags; - -install_route: - if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && - !netif_carrier_ok(dev)) - rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; - rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->fib6_nh.nh_dev = dev; - rt->fib6_table = table; - - if (idev) - in6_dev_put(idev); - return rt; out: - if (dev) - dev_put(dev); - if (idev) - in6_dev_put(idev); - fib6_info_release(rt); return ERR_PTR(err); } @@ -3261,10 +3350,16 @@ static int ip6_route_del(struct fib6_config *cfg, if (fn) { for_each_fib6_node_rt_rcu(fn) { + struct fib6_nh *nh; + if (cfg->fc_flags & RTF_CACHE) { + struct fib6_result res = { + .f6i = rt, + }; int rc; - 
rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + rt_cache = rt6_find_cached_rt(&res, + &cfg->fc_dst, &cfg->fc_src); if (rt_cache) { rc = ip6_del_cached_rt(rt_cache, cfg); @@ -3275,12 +3370,14 @@ static int ip6_route_del(struct fib6_config *cfg, } continue; } + + nh = &rt->fib6_nh; if (cfg->fc_ifindex && - (!rt->fib6_nh.nh_dev || - rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) + (!nh->fib_nh_dev || + nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && - !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) + !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; @@ -3306,10 +3403,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; + struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; - struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3392,14 +3489,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NDISC_REDIRECT, &ndopts); rcu_read_lock(); - from = rcu_dereference(rt->from); - /* This fib6_info_hold() is safe here because we hold reference to rt - * and rt already holds reference to fib6_info. - */ - fib6_info_hold(from); - rcu_read_unlock(); + res.f6i = rcu_dereference(rt->from); + if (!res.f6i) + goto out; - nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); + res.nh = &res.f6i->fib6_nh; + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; @@ -3409,11 +3506,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - /* No need to remove rt from the exception table if rt is - * a cached route because rt6_insert_exception() will - * takes care of it - */ - if (rt6_insert_exception(nrt, from)) { + /* rt6_insert_exception() will take care of duplicated exceptions */ + if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } @@ -3425,7 +3519,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: - fib6_info_release(from); + rcu_read_unlock(); neigh_release(neigh); } @@ -3451,11 +3545,12 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.nh_dev->ifindex != ifindex) + if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) continue; - if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + if (!(rt->fib6_flags & RTF_ROUTEINFO) || + !rt->fib6_nh.fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3513,9 +3608,11 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (dev == rt->fib6_nh.nh_dev && + struct fib6_nh *nh = &rt->fib6_nh; + + if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && - ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) + ipv6_addr_equal(&nh->fib_nh_gw6, addr)) break; } if (rt && !fib6_info_hold_safe(rt)) @@ -3606,7 +3703,7 @@ static void rtmsg_to_fib6_config(struct net *net, .fc_table 
= l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, - .fc_metric = rtmsg->rtmsg_metric, + .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, @@ -3664,23 +3761,34 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { - int type; struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(dst->dev); + struct inet6_dev *idev; + int type; + + if (netif_is_l3_master(skb->dev) && + dst->dev == net->loopback_dev) + idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); + else + idev = ip6_dst_idev(dst); + switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), - __in6_dev_get_safely(skb->dev), - IPSTATS_MIB_INADDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } /* FALLTHROUGH */ case IPSTATS_MIB_OUTNOROUTES: - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), - ipstats_mib_noroutes); + IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } + + /* Start over by dropping the dst for l3mdev case */ + if (netif_is_l3_master(skb->dev)) + skb_dst_drop(skb); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb(skb); return 0; @@ -3717,36 +3825,26 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags) { - u32 tb_id; - struct net_device *dev = idev->dev; - struct fib6_info *f6i; - - f6i = fib6_info_alloc(gfp_flags); - if (!f6i) - return ERR_PTR(-ENOMEM); + struct fib6_config cfg = { + .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, + .fc_ifindex = idev->dev->ifindex, + .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, + .fc_dst = *addr, + .fc_dst_len = 128, + .fc_protocol = RTPROT_KERNEL, + .fc_nlinfo.nl_net = net, + .fc_ignore_dev_down = true, + }; - f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); - f6i->dst_nocount = true; - f6i->dst_host = true; - f6i->fib6_protocol = RTPROT_KERNEL; - f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { - f6i->fib6_type = RTN_ANYCAST; - f6i->fib6_flags |= RTF_ANYCAST; + cfg.fc_type = RTN_ANYCAST; + cfg.fc_flags |= RTF_ANYCAST; } else { - f6i->fib6_type = RTN_LOCAL; - f6i->fib6_flags |= RTF_LOCAL; + cfg.fc_type = RTN_LOCAL; + cfg.fc_flags |= RTF_LOCAL; } - f6i->fib6_nh.nh_gw = *addr; - dev_hold(dev); - f6i->fib6_nh.nh_dev = dev; - f6i->fib6_dst.addr = *addr; - f6i->fib6_dst.plen = 128; - tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; - f6i->fib6_table = fib6_get_table(net, tb_id); - - return f6i; + return ip6_route_info_create(&cfg, gfp_flags, NULL); } /* remove deleted ip from prefsrc entries */ @@ -3762,7 +3860,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && + if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3784,7 +3882,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) fib6_clean_all(net, fib6_remove_prefsrc, &adni); } -#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) @@ -3792,7 +3890,8 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) struct in6_addr *gateway = (struct in6_addr *)arg; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { + rt->fib6_nh.fib_nh_gw_family && + ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { return -1; } @@ -3813,7 +3912,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) struct arg_netdev_event { const struct net_device *dev; union { - unsigned int nh_flags; + unsigned char nh_flags; unsigned long event; }; }; @@ -3840,9 +3939,9 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - fib6_ignore_linkdown(rt))) + if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) return true; return false; @@ -3854,11 +3953,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.nh_weight; + total += rt->fib6_nh.fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.nh_weight; + total += iter->fib6_nh.fib_nh_weight; } return total; @@ -3869,11 +3968,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.nh_weight; + *weight += rt->fib6_nh.fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -3916,8 +4015,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { - rt->fib6_nh.nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && + rt->fib6_nh.fib_nh_dev == arg->dev) { + rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3925,7 +4025,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) return 0; } -void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +void rt6_sync_up(struct net_device *dev, 
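
The renamed *_upper_bound helpers implement hash-threshold multipath: each live sibling is assigned an upper bound proportional to its cumulative weight scaled onto [0, 2^31), dead nexthops get -1, and the selection loop in fib6_select_path() picks the first sibling whose bound is at least the 31-bit flow hash. A compact userspace model with a worked example for weights 1 and 3 (rounding to nearest, like the kernel's DIV_ROUND_CLOSEST_ULL):

	#include <assert.h>
	#include <stdint.h>

	/* model of rt6_upper_bound_set(): scale cumulative weight onto [0, 2^31) */
	static int32_t upper_bound(uint64_t cum_weight, uint64_t total)
	{
		return (int32_t)(((cum_weight << 31) + total / 2) / total) - 1;
	}

	/* model of the selection loop: first bound >= hash wins;
	 * dead entries carry bound -1 and are skipped automatically */
	static int select_nexthop(const int32_t *bounds, int n, int32_t hash)
	{
		for (int i = 0; i < n; i++)
			if (hash <= bounds[i])
				return i;
		return n - 1;	/* no live bound; the kernel falls back differently */
	}

	int main(void)
	{
		/* weights 1 and 3: the second nexthop absorbs ~75% of hashes */
		int32_t bounds[2] = {
			upper_bound(1, 4),	/* 536870911, i.e. 2^29 - 1 */
			upper_bound(4, 4),	/* 2147483647, i.e. 2^31 - 1 */
		};

		assert(select_nexthop(bounds, 2, 100) == 0);
		assert(select_nexthop(bounds, 2, 536870912) == 1);
		return 0;
	}
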
unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, @@ -3945,10 +4045,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.nh_dev == dev) + if (rt->fib6_nh.fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.nh_dev == dev) + if (iter->fib6_nh.fib_nh_dev == dev) return true; return false; @@ -3969,12 +4069,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.nh_dev == down_dev || - rt->fib6_nh.nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.fib_nh_dev == down_dev || + rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.nh_dev == down_dev || - iter->fib6_nh.nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh.fib_nh_dev == down_dev || + iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -3982,15 +4082,15 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, - unsigned int nh_flags) + unsigned char nh_flags) { struct fib6_info *iter; - if (rt->fib6_nh.nh_dev == dev) - rt->fib6_nh.nh_flags |= nh_flags; + if (rt->fib6_nh.fib_nh_dev == dev) + rt->fib6_nh.fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.nh_dev == dev) - iter->fib6_nh.nh_flags |= nh_flags; + if (iter->fib6_nh.fib_nh_dev == dev) + iter->fib6_nh.fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4005,12 +4105,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.nh_dev == dev ? -1 : 0; + return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.nh_dev == dev ? -1 : 0; + return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4026,10 +4126,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.nh_dev != dev || + if (rt->fib6_nh.fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4085,7 +4185,7 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. 
jumbo frame) */ - if (rt->fib6_nh.nh_dev == arg->dev && + if (rt->fib6_nh.fib_nh_dev == arg->dev && !fib6_metric_locked(rt, RTAX_MTU)) { u32 mtu = rt->fib6_pmtu; @@ -4139,8 +4239,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int pref; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); if (err < 0) goto errout; @@ -4376,7 +4476,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4526,6 +4626,9 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_metric == 0) + cfg.fc_metric = IP6_RT_PRIO_USER; + if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else @@ -4540,7 +4643,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); + + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); nexthop_len *= rt->fib6_nsiblings; } @@ -4558,77 +4661,10 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } -static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, - unsigned int *flags, bool skip_oif) -{ - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) - *flags |= RTNH_F_DEAD; - - if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { - *flags |= RTNH_F_LINKDOWN; - - rcu_read_lock(); - if (fib6_ignore_linkdown(rt)) - *flags |= RTNH_F_DEAD; - rcu_read_unlock(); - } - - if (rt->fib6_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) - goto nla_put_failure; - } - - *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); - if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) - *flags |= RTNH_F_OFFLOAD; - - /* not needed for multipath encoding b/c it has a rtnexthop struct */ - if (!skip_oif && rt->fib6_nh.nh_dev && - nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) - goto nla_put_failure; - - if (rt->fib6_nh.nh_lwtstate && - lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -/* add multipath next hop */ -static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) -{ - const struct net_device *dev = rt->fib6_nh.nh_dev; - struct rtnexthop *rtnh; - unsigned int flags = 0; - - rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); - if (!rtnh) - goto nla_put_failure; - - rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; - rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; - - if (rt6_nexthop_info(skb, rt, &flags, true) < 0) - goto nla_put_failure; - - rtnh->rtnh_flags = flags; - - /* length of rtnetlink header + attributes */ - rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4741,23 +4777,30 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *sibling, *next_sibling; struct nlattr *mp; - mp = nla_nest_start(skb, RTA_MULTIPATH); + mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; - if (rt6_add_nexthop(skb, rt) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, + rt->fib6_nh.fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (rt6_add_nexthop(skb, sibling) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, + sibling->fib6_nh.fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); } else { - if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) + unsigned char nh_flags = 0; + + if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + &nh_flags, false) < 0) goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { @@ -4783,7 +4826,7 @@ nla_put_failure: static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.nh_dev == dev) + if (f6i->fib6_nh.fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4791,7 +4834,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.nh_dev == dev) + if (sibling->fib6_nh.fib_nh_dev == dev) return true; } } @@ -4843,8 +4886,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv6_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || @@ -4860,8 +4903,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv6_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); if (err) return err; @@ -5013,16 +5056,20 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); from = rcu_dereference(rt->from); - - if (fibmatch) - err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); - else - err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, - &fl6.saddr, iif, RTM_NEWROUTE, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - 0); + if (from) { + if (fibmatch) + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, + iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + else + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + } else { + err = -ENETUNREACH; + } rcu_read_unlock(); if (err < 0) { @@ -5076,7 +5123,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - 
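
The removed rt6_add_nexthop() shows the usual netlink idiom for variable-length nested objects, which its replacement fib_add_nexthop() (shared with IPv4 and not visible in this diff) inherits: reserve the rtnexthop header first, append the attributes behind it, then patch rtnh_len from the final buffer position. A toy model of that idiom, with an invented append-only buffer standing in for the skb:

	#include <stdint.h>
	#include <string.h>

	struct buf { uint8_t data[256]; size_t pos; };

	static void *buf_reserve(struct buf *b, size_t n)
	{
		void *p = &b->data[b->pos];

		b->pos += (n + 3) & ~(size_t)3;	/* RTA_ALIGN: 4-byte units */
		memset(p, 0, n);
		return p;
	}

	struct toy_rtnexthop {	/* layout mirrors struct rtnexthop */
		uint16_t rtnh_len;
		uint8_t  rtnh_flags;
		uint8_t  rtnh_hops;
		int32_t  rtnh_ifindex;
	};

	static void add_nexthop(struct buf *b, int ifindex, uint8_t weight)
	{
		struct toy_rtnexthop *rtnh = buf_reserve(b, sizeof(*rtnh));

		rtnh->rtnh_ifindex = ifindex;
		rtnh->rtnh_hops = weight - 1;	/* kernel stores weight minus one */

		/* ... RTA_GATEWAY and friends would be appended here ... */

		/* patch the length last, once the nested payload is complete */
		rtnh->rtnh_len = (uint16_t)((uint8_t *)&b->data[b->pos] -
					    (uint8_t *)rtnh);
	}

Patching the length after the fact is what lets the encoder stay single-pass: nothing needs to know the attribute sizes up front, exactly as with nlmsg_get_pos() in the removed kernel code.
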
net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5411,7 +5458,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 9b2f272ca164..0c5479ef9b38 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -398,28 +398,28 @@ static struct pernet_operations ip6_segments_ops = { static const struct genl_ops seg6_genl_ops[] = { { .cmd = SEG6_CMD_SETHMAC, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_sethmac, - .policy = seg6_genl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_DUMPHMAC, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = seg6_genl_dumphmac_start, .dumpit = seg6_genl_dumphmac, .done = seg6_genl_dumphmac_done, - .policy = seg6_genl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_SET_TUNSRC, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_set_tunsrc, - .policy = seg6_genl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = SEG6_CMD_GET_TUNSRC, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = seg6_genl_get_tunsrc, - .policy = seg6_genl_policy, .flags = GENL_ADMIN_PERM, }, }; @@ -429,6 +429,7 @@ static struct genl_family seg6_genl_family __ro_after_init = { .name = SEG6_GENL_NAME, .version = SEG6_GENL_VERSION, .maxattr = SEG6_ATTR_MAX, + .policy = seg6_genl_policy, .netnsok = true, .parallel_ops = true, .ops = seg6_genl_ops, diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index ee5403cbe655..7a525fda8978 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -396,8 +396,8 @@ static int seg6_build_state(struct nlattr *nla, if (family != AF_INET && family != AF_INET6) return -EINVAL; - err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla, - seg6_iptunnel_policy, extack); + err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla, + seg6_iptunnel_policy, extack); if (err < 0) return err; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 60325dbfe88b..78155fdb8c36 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -823,8 +823,9 @@ static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) int ret; u32 fd; - ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX, - attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL); + ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_BPF_PROG_MAX, + attrs[SEG6_LOCAL_BPF], + bpf_prog_policy, NULL); if (ret < 0) return ret; @@ -853,7 +854,7 @@ static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) if (!slwt->bpf.prog) return 0; - nest = nla_nest_start(skb, SEG6_LOCAL_BPF); + nest = nla_nest_start_noflag(skb, SEG6_LOCAL_BPF); if (!nest) return -EMSGSIZE; @@ -959,8 +960,8 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family, if (family != AF_INET6) return -EINVAL; - err = nla_parse_nested(tb, SEG6_LOCAL_MAX, 
nla, seg6_local_policy, - extack); + err = nla_parse_nested_deprecated(tb, SEG6_LOCAL_MAX, nla, + seg6_local_policy, extack); if (err < 0) return err; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 07e21a82ce4c..971d60bf9640 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -669,6 +669,10 @@ static int ipip6_rcv(struct sk_buff *skb) !net_eq(tunnel->net, dev_net(tunnel->dev)))) goto out; + /* skb can be uncloned in iptunnel_pull_header, so the + * old iph pointer is no longer valid + */ + iph = (const struct iphdr *)skb_mac_header(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) @@ -1080,7 +1084,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); - if (tdev) { + if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 44d431849d39..beaf28456301 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -43,6 +43,7 @@ #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> +#include <linux/indirect_call_wrapper.h> #include <net/tcp.h> #include <net/ndisc.h> @@ -90,6 +91,18 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, } #endif +/* Helper returning the inet6 state (ipv6_pinfo) of a given tcp socket. + * It can be used in the TCP stack instead of inet6_sk(sk). + * This avoids a dereference and allows compiler optimizations. + * It is a specialized version of inet6_sk_generic(). + */ +static struct ipv6_pinfo *tcp_inet6_sk(const struct sock *sk) +{ + unsigned int offset = sizeof(struct tcp6_sock) - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} + static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -99,7 +112,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } } @@ -138,7 +151,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct ipv6_txoptions *opt; @@ -390,7 +403,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state == TCP_CLOSE) goto out; - if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { + if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } @@ -405,7 +418,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - np = inet6_sk(sk); + np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { @@ -478,7 +491,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type) { struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; @@ -737,7 +750,7 @@
static void tcp_v6_init_req(struct request_sock *req, { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); - const struct ipv6_pinfo *np = inet6_sk(sk_listener); + const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; @@ -1066,9 +1079,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; - const struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; - struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; @@ -1088,11 +1100,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (!newsk) return NULL; - newtcp6sk = (struct tcp6_sock *)newsk; - inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newinet = inet_sk(newsk); - newnp = inet6_sk(newsk); + newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); @@ -1156,12 +1167,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); - newtcp6sk = (struct tcp6_sock *)newsk; - inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); - newnp = inet6_sk(newsk); + newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); @@ -1276,9 +1286,9 @@ out: */ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp; + struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; + struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. 
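The tcp_inet6_sk() conversions above work because ipv6_pinfo is laid out as the trailing member of struct tcp6_sock, so it sits at a compile-time-constant offset from the socket pointer and the pinet6 dereference done by inet6_sk() can be skipped. A minimal standalone sketch of the same fixed-offset pattern; the struct and field names below are stand-ins for illustration, not the kernel's definitions:

#include <stdio.h>
#include <stddef.h>

struct pinfo { int min_hopcount; };      /* stand-in for ipv6_pinfo  */
struct sock { int state; };              /* stand-in for struct sock */

struct tcp6_sock_demo {                  /* stand-in container       */
	struct sock sk;                  /* must be the first member */
	int tcp_private[4];
	struct pinfo inet6;              /* must be the last member  */
};

/* Same trick as tcp_inet6_sk(): with no tail padding, the pinfo area
 * always lives sizeof(container) - sizeof(pinfo) bytes past the start
 * of the socket, so the lookup folds into one constant offset instead
 * of loading a pinet6 pointer.
 */
static struct pinfo *fast_pinfo(struct sock *sk)
{
	size_t off = sizeof(struct tcp6_sock_demo) - sizeof(struct pinfo);

	return (struct pinfo *)((char *)sk + off);
}

int main(void)
{
	struct tcp6_sock_demo s = { .inet6.min_hopcount = 42 };

	/* Both lookups agree because inet6 is the trailing member. */
	printf("%d == %d\n", s.inet6.min_hopcount,
	       fast_pinfo(&s.sk)->min_hopcount);
	return 0;
}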
@@ -1426,8 +1436,9 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } -static int tcp_v6_rcv(struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct sk_buff *skb_to_free; int sdif = inet6_sdif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; @@ -1524,7 +1535,7 @@ process: return 0; } } - if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { + if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } @@ -1554,12 +1565,17 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { + skb_to_free = sk->sk_rx_skb_cache; + sk->sk_rx_skb_cache = NULL; ret = tcp_v6_do_rcv(sk, skb); - } else if (tcp_add_backlog(sk, skb)) { - goto discard_and_relse; + } else { + if (tcp_add_backlog(sk, skb)) + goto discard_and_relse; + skb_to_free = NULL; } bh_unlock_sock(sk); - + if (skb_to_free) + __kfree_skb(skb_to_free); put_and_return: if (refcounted) sock_put(sk); @@ -1639,7 +1655,7 @@ do_time_wait: goto discard_it; } -static void tcp_v6_early_demux(struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) { const struct ipv6hdr *hdr; const struct tcphdr *th; @@ -1669,7 +1685,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) - dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); + dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie); if (dst && inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b444483cdb2b..07fa579dfb96 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -36,6 +36,7 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/indirect_call_wrapper.h> #include <net/addrconf.h> #include <net/ndisc.h> @@ -285,8 +286,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, peeking, off; - int err; + int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); struct udp_mib __percpu *mib; bool checksum_valid = false; @@ -299,9 +299,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = flags & MSG_PEEK; off = sk_peek_offset(sk, flags); - skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); + skb = __skb_recv_udp(sk, flags, noblock, &off, &err); if (!skb) return err; @@ -340,14 +339,14 @@ try_again: goto csum_copy_err; } if (unlikely(err)) { - if (!peeked) { + if (!peeking) { atomic_inc(&sk->sk_drops); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); return err; } - if (!peeked) + if (!peeking) SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS); sock_recv_ts_and_drops(msg, sk, skb); @@ -982,7 +981,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, return NULL; } -static void udp_v6_early_demux(struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct udphdr *uh; @@ -1023,7 +1022,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) } } -static __inline__ int udpv6_rcv(struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); } @@ -1047,6 +1046,8 @@ static void 
udp_v6_flush_pending_frames(struct sock *sk) static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return -EINVAL; /* The following checks are replicated from __ip6_datagram_connect() * and intended to prevent BPF program called below from accessing * bytes that are out of the bound specified by user in addr_len. diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c deleted file mode 100644 index 57fd314ec2b8..000000000000 --- a/net/ipv6/xfrm6_mode_beet.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6. - * - * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com> - * Miika Komu <miika@iki.fi> - * Herbert Xu <herbert@gondor.apana.org.au> - * Abhinav Pathak <abhinav.pathak@hiit.fi> - * Jeff Ahrenholz <ahrenholz@gmail.com> - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/stringify.h> -#include <net/dsfield.h> -#include <net/dst.h> -#include <net/inet_ecn.h> -#include <net/ipv6.h> -#include <net/xfrm.h> - -static void xfrm6_beet_make_header(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - - iph->version = 6; - - memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, - sizeof(iph->flow_lbl)); - iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; - - ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); - iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; -} - -/* Add encapsulation header. - * - * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. - */ -static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ipv6hdr *top_iph; - struct ip_beet_phdr *ph; - int optlen, hdr_len; - - hdr_len = 0; - optlen = XFRM_MODE_SKB_CB(skb)->optlen; - if (unlikely(optlen)) - hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4); - - skb_set_network_header(skb, -x->props.header_len - hdr_len); - if (x->sel.family != AF_INET6) - skb->network_header += IPV4_BEET_PHMAXLEN; - skb->mac_header = skb->network_header + - offsetof(struct ipv6hdr, nexthdr); - skb->transport_header = skb->network_header + sizeof(*top_iph); - ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdr_len); - - xfrm6_beet_make_header(skb); - - top_iph = ipv6_hdr(skb); - if (unlikely(optlen)) { - - BUG_ON(optlen < 0); - - ph->padlen = 4 - (optlen & 4); - ph->hdrlen = optlen / 8; - ph->nexthdr = top_iph->nexthdr; - if (ph->padlen) - memset(ph + 1, IPOPT_NOP, ph->padlen); - - top_iph->nexthdr = IPPROTO_BEETPH; - } - - top_iph->saddr = *(struct in6_addr *)&x->props.saddr; - top_iph->daddr = *(struct in6_addr *)&x->id.daddr; - return 0; -} - -static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ipv6hdr *ip6h; - int size = sizeof(struct ipv6hdr); - int err; - - err = skb_cow_head(skb, size + skb->mac_len); - if (err) - goto out; - - __skb_push(skb, size); - skb_reset_network_header(skb); - skb_mac_header_rebuild(skb); - - xfrm6_beet_make_header(skb); - - ip6h = ipv6_hdr(skb); - ip6h->payload_len = htons(skb->len - size); - ip6h->daddr = x->sel.daddr.in6; - ip6h->saddr = x->sel.saddr.in6; - err = 0; -out: - return err; -} - -static struct xfrm_mode xfrm6_beet_mode = { - .input2 = xfrm6_beet_input, - .input = xfrm_prepare_input, - .output2 = xfrm6_beet_output, - .output = xfrm6_prepare_output, - .owner = THIS_MODULE, - .encap = XFRM_MODE_BEET, - .flags = XFRM_MODE_FLAG_TUNNEL, -}; - -static int __init xfrm6_beet_init(void) -{ - return 
xfrm_register_mode(&xfrm6_beet_mode, AF_INET6); -} - -static void __exit xfrm6_beet_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6); - BUG_ON(err); -} - -module_init(xfrm6_beet_init); -module_exit(xfrm6_beet_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET); diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c deleted file mode 100644 index da28e4407b8f..000000000000 --- a/net/ipv6/xfrm6_mode_ro.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * xfrm6_mode_ro.c - Route optimization mode for IPv6. - * - * Copyright (C)2003-2006 Helsinki University of Technology - * Copyright (C)2003-2006 USAGI/WIDE Project - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - */ -/* - * Authors: - * Noriaki TAKAMIYA @USAGI - * Masahide NAKAMURA @USAGI - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/spinlock.h> -#include <linux/stringify.h> -#include <linux/time.h> -#include <net/ipv6.h> -#include <net/xfrm.h> - -/* Add route optimization header space. - * - * The IP header and mutable extension headers will be moved forward to make - * space for the route optimization header. - */ -static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ipv6hdr *iph; - u8 *prevhdr; - int hdr_len; - - iph = ipv6_hdr(skb); - - hdr_len = x->type->hdr_offset(x, skb, &prevhdr); - if (hdr_len < 0) - return hdr_len; - skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); - skb_set_network_header(skb, -x->props.header_len); - skb->transport_header = skb->network_header + hdr_len; - __skb_pull(skb, hdr_len); - memmove(ipv6_hdr(skb), iph, hdr_len); - - x->lastused = ktime_get_real_seconds(); - - return 0; -} - -static struct xfrm_mode xfrm6_ro_mode = { - .output = xfrm6_ro_output, - .owner = THIS_MODULE, - .encap = XFRM_MODE_ROUTEOPTIMIZATION, -}; - -static int __init xfrm6_ro_init(void) -{ - return xfrm_register_mode(&xfrm6_ro_mode, AF_INET6); -} - -static void __exit xfrm6_ro_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm6_ro_mode, AF_INET6); - BUG_ON(err); -} - -module_init(xfrm6_ro_init); -module_exit(xfrm6_ro_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_ROUTEOPTIMIZATION); diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c deleted file mode 100644 index 3c29da5defe6..000000000000 --- a/net/ipv6/xfrm6_mode_transport.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. 
- * - * Copyright (C) 2002 USAGI/WIDE Project - * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/stringify.h> -#include <net/dst.h> -#include <net/ipv6.h> -#include <net/xfrm.h> -#include <net/protocol.h> - -/* Add encapsulation header. - * - * The IP header and mutable extension headers will be moved forward to make - * space for the encapsulation header. - */ -static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct ipv6hdr *iph; - u8 *prevhdr; - int hdr_len; - - iph = ipv6_hdr(skb); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - - hdr_len = x->type->hdr_offset(x, skb, &prevhdr); - if (hdr_len < 0) - return hdr_len; - skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); - skb_set_network_header(skb, -x->props.header_len); - skb->transport_header = skb->network_header + hdr_len; - __skb_pull(skb, hdr_len); - memmove(ipv6_hdr(skb), iph, hdr_len); - return 0; -} - -/* Remove encapsulation header. - * - * The IP header will be moved over the top of the encapsulation header. - * - * On entry, skb->h shall point to where the IP header should be and skb->nh - * shall be set to where the IP header currently is. skb->data shall point - * to the start of the payload. - */ -static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int ihl = skb->data - skb_transport_header(skb); - - if (skb->transport_header != skb->network_header) { - memmove(skb_transport_header(skb), - skb_network_header(skb), ihl); - skb->network_header = skb->transport_header; - } - ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - - sizeof(struct ipv6hdr)); - skb_reset_transport_header(skb); - return 0; -} - -static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, - struct sk_buff *skb, - netdev_features_t features) -{ - const struct net_offload *ops; - struct sk_buff *segs = ERR_PTR(-EINVAL); - struct xfrm_offload *xo = xfrm_offload(skb); - - skb->transport_header += x->props.header_len; - ops = rcu_dereference(inet6_offloads[xo->proto]); - if (likely(ops && ops->callbacks.gso_segment)) - segs = ops->callbacks.gso_segment(skb, features); - - return segs; -} - -static void xfrm6_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) -{ - struct xfrm_offload *xo = xfrm_offload(skb); - - skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + sizeof(struct ipv6hdr) + x->props.header_len); - - if (xo->flags & XFRM_GSO_SEGMENT) { - skb_reset_transport_header(skb); - skb->transport_header -= x->props.header_len; - } -} - - -static struct xfrm_mode xfrm6_transport_mode = { - .input = xfrm6_transport_input, - .output = xfrm6_transport_output, - .gso_segment = xfrm4_transport_gso_segment, - .xmit = xfrm6_transport_xmit, - .owner = THIS_MODULE, - .encap = XFRM_MODE_TRANSPORT, -}; - -static int __init xfrm6_transport_init(void) -{ - return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); -} - -static void __exit xfrm6_transport_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); - BUG_ON(err); -} - -module_init(xfrm6_transport_init); -module_exit(xfrm6_transport_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c deleted file mode 100644 index de1b0b8c53b0..000000000000 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ /dev/null @@ -1,151 
+0,0 @@ -/* - * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. - * - * Copyright (C) 2002 USAGI/WIDE Project - * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> - */ - -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/stringify.h> -#include <net/dsfield.h> -#include <net/dst.h> -#include <net/inet_ecn.h> -#include <net/ip6_route.h> -#include <net/ipv6.h> -#include <net/xfrm.h> - -static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) -{ - struct ipv6hdr *inner_iph = ipipv6_hdr(skb); - - if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) - IP6_ECN_set_ce(skb, inner_iph); -} - -/* Add encapsulation header. - * - * The top IP header will be constructed per RFC 2401. - */ -static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - struct ipv6hdr *top_iph; - int dsfield; - - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); - - skb_set_network_header(skb, -x->props.header_len); - skb->mac_header = skb->network_header + - offsetof(struct ipv6hdr, nexthdr); - skb->transport_header = skb->network_header + sizeof(*top_iph); - top_iph = ipv6_hdr(skb); - - top_iph->version = 6; - - memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, - sizeof(top_iph->flow_lbl)); - top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); - - if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) - dsfield = 0; - else - dsfield = XFRM_MODE_SKB_CB(skb)->tos; - dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos); - if (x->props.flags & XFRM_STATE_NOECN) - dsfield &= ~INET_ECN_MASK; - ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst)); - top_iph->saddr = *(struct in6_addr *)&x->props.saddr; - top_iph->daddr = *(struct in6_addr *)&x->id.daddr; - return 0; -} - -#define for_each_input_rcu(head, handler) \ - for (handler = rcu_dereference(head); \ - handler != NULL; \ - handler = rcu_dereference(handler->next)) - - -static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) -{ - int err = -EINVAL; - - if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) - goto out; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto out; - - err = skb_unclone(skb, GFP_ATOMIC); - if (err) - goto out; - - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), - ipipv6_hdr(skb)); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(skb); - - skb_reset_network_header(skb); - skb_mac_header_rebuild(skb); - if (skb->mac_len) - eth_hdr(skb)->h_proto = skb->protocol; - - err = 0; - -out: - return err; -} - -static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x, - struct sk_buff *skb, - netdev_features_t features) -{ - __skb_push(skb, skb->mac_len); - return skb_mac_gso_segment(skb, features); -} - -static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) -{ - struct xfrm_offload *xo = xfrm_offload(skb); - - if (xo->flags & XFRM_GSO_SEGMENT) - skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - - skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + x->props.header_len); -} - -static struct xfrm_mode xfrm6_tunnel_mode = { - .input2 = xfrm6_mode_tunnel_input, - .input = xfrm_prepare_input, - .output2 = xfrm6_mode_tunnel_output, - .output = xfrm6_prepare_output, - 
.gso_segment = xfrm6_mode_tunnel_gso_segment, - .xmit = xfrm6_mode_tunnel_xmit, - .owner = THIS_MODULE, - .encap = XFRM_MODE_TUNNEL, - .flags = XFRM_MODE_FLAG_TUNNEL, -}; - -static int __init xfrm6_mode_tunnel_init(void) -{ - return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); -} - -static void __exit xfrm6_mode_tunnel_exit(void) -{ - int err; - - err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); - BUG_ON(err); -} - -module_init(xfrm6_mode_tunnel_init); -module_exit(xfrm6_mode_tunnel_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6a74080005cf..8ad5e54eb8ca 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -111,21 +111,6 @@ int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) return xfrm6_extract_header(skb); } -int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm_inner_extract_output(x, skb); - if (err) - return err; - - skb->ignore_df = 1; - skb->protocol = htons(ETH_P_IPV6); - - return x->outer_mode->output2(x, skb); -} -EXPORT_SYMBOL(xfrm6_prepare_output); - int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); @@ -137,11 +122,28 @@ int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) return xfrm_output(sk, skb); } +static int __xfrm6_output_state_finish(struct xfrm_state *x, struct sock *sk, + struct sk_buff *skb) +{ + const struct xfrm_state_afinfo *afinfo; + int ret = -EAFNOSUPPORT; + + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); + if (likely(afinfo)) + ret = afinfo->output_finish(sk, skb); + else + kfree_skb(skb); + rcu_read_unlock(); + + return ret; +} + static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct xfrm_state *x = skb_dst(skb)->xfrm; - return x->outer_mode->afinfo->output_finish(sk, skb); + return __xfrm6_output_state_finish(x, sk, skb); } static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -183,7 +185,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) __xfrm6_output_finish); skip_frag: - return x->outer_mode->afinfo->output_finish(sk, skb); + return __xfrm6_output_state_finish(x, sk, skb); } int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 769f8f78d3b8..699e0730ce8e 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -22,9 +22,6 @@ #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/l3mdev.h> -#if IS_ENABLED(CONFIG_IPV6_MIP6) -#include <net/mip6.h> -#endif static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, @@ -71,24 +68,6 @@ static int xfrm6_get_saddr(struct net *net, int oif, return 0; } -static int xfrm6_get_tos(const struct flowi *fl) -{ - return 0; -} - -static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, - int nfheader_len) -{ - if (dst->ops->family == AF_INET6) { - struct rt6_info *rt = (struct rt6_info *)dst; - path->path_cookie = rt6_get_cookie(rt); - } - - path->u.rt6.rt6i_nfheader_len = nfheader_len; - - return 0; -} - static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { @@ -118,108 +97,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return 0; } -static inline void -_decode_session6(struct sk_buff 
*skb, struct flowi *fl, int reverse) -{ - struct flowi6 *fl6 = &fl->u.ip6; - int onlyproto = 0; - const struct ipv6hdr *hdr = ipv6_hdr(skb); - u32 offset = sizeof(*hdr); - struct ipv6_opt_hdr *exthdr; - const unsigned char *nh = skb_network_header(skb); - u16 nhoff = IP6CB(skb)->nhoff; - int oif = 0; - u8 nexthdr; - - if (!nhoff) - nhoff = offsetof(struct ipv6hdr, nexthdr); - - nexthdr = nh[nhoff]; - - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; - - memset(fl6, 0, sizeof(struct flowi6)); - fl6->flowi6_mark = skb->mark; - fl6->flowi6_oif = reverse ? skb->skb_iif : oif; - - fl6->daddr = reverse ? hdr->saddr : hdr->daddr; - fl6->saddr = reverse ? hdr->daddr : hdr->saddr; - - while (nh + offset + sizeof(*exthdr) < skb->data || - pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) { - nh = skb_network_header(skb); - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - - switch (nexthdr) { - case NEXTHDR_FRAGMENT: - onlyproto = 1; - /* fall through */ - case NEXTHDR_ROUTING: - case NEXTHDR_HOP: - case NEXTHDR_DEST: - offset += ipv6_optlen(exthdr); - nexthdr = exthdr->nexthdr; - exthdr = (struct ipv6_opt_hdr *)(nh + offset); - break; - - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_TCP: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - if (!onlyproto && (nh + offset + 4 < skb->data || - pskb_may_pull(skb, nh + offset + 4 - skb->data))) { - __be16 *ports; - - nh = skb_network_header(skb); - ports = (__be16 *)(nh + offset); - fl6->fl6_sport = ports[!!reverse]; - fl6->fl6_dport = ports[!reverse]; - } - fl6->flowi6_proto = nexthdr; - return; - - case IPPROTO_ICMPV6: - if (!onlyproto && (nh + offset + 2 < skb->data || - pskb_may_pull(skb, nh + offset + 2 - skb->data))) { - u8 *icmp; - - nh = skb_network_header(skb); - icmp = (u8 *)(nh + offset); - fl6->fl6_icmp_type = icmp[0]; - fl6->fl6_icmp_code = icmp[1]; - } - fl6->flowi6_proto = nexthdr; - return; - -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case IPPROTO_MH: - offset += ipv6_optlen(exthdr); - if (!onlyproto && (nh + offset + 3 < skb->data || - pskb_may_pull(skb, nh + offset + 3 - skb->data))) { - struct ip6_mh *mh; - - nh = skb_network_header(skb); - mh = (struct ip6_mh *)(nh + offset); - fl6->fl6_mh_type = mh->ip6mh_type; - } - fl6->flowi6_proto = nexthdr; - return; -#endif - - /* XXX Why are there these headers? 
*/ - case IPPROTO_AH: - case IPPROTO_ESP: - case IPPROTO_COMP: - default: - fl6->fl6_ipsec_spi = 0; - fl6->flowi6_proto = nexthdr; - return; - } - } -} - static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -291,9 +168,6 @@ static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, - .decode_session = _decode_session6, - .get_tos = xfrm6_get_tos, - .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, }; diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index cc979b702c89..aaacac7fdbce 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -46,7 +46,7 @@ static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol) handler != NULL; \ handler = rcu_dereference(handler->next)) \ -int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) +static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) { int ret; struct xfrm6_protocol *handler; @@ -61,7 +61,6 @@ int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } -EXPORT_SYMBOL(xfrm6_rcv_cb); static int xfrm6_esp_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bc65db782bfb..d9e5f6808811 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -345,7 +345,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) unsigned int i; xfrm_flush_gc(); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); + xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); @@ -402,6 +402,10 @@ static void __exit xfrm6_tunnel_fini(void) xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); unregister_pernet_subsys(&xfrm6_tunnel_net_ops); + /* Someone may still hold a reference to an xfrm6_tunnel_spi, + * so wait for all readers to finish before destroying the cache.
+ */ + rcu_barrier(); kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index c5c5ab6c5a1c..44fdc641710d 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -2054,14 +2054,14 @@ static int __init kcm_init(void) if (err) goto fail; - err = sock_register(&kcm_family_ops); - if (err) - goto sock_register_fail; - err = register_pernet_device(&kcm_net_ops); if (err) goto net_ops_fail; + err = sock_register(&kcm_family_ops); + if (err) + goto sock_register_fail; + err = kcm_proc_init(); if (err) goto proc_init_fail; @@ -2069,12 +2069,12 @@ static int __init kcm_init(void) return 0; proc_init_fail: - unregister_pernet_device(&kcm_net_ops); - -net_ops_fail: sock_unregister(PF_KCM); sock_register_fail: + unregister_pernet_device(&kcm_net_ops); + +net_ops_fail: proto_unregister(&kcm_proto); fail: @@ -2090,8 +2090,8 @@ fail: static void __exit kcm_exit(void) { kcm_proc_exit(); - unregister_pernet_device(&kcm_net_ops); sock_unregister(PF_KCM); + unregister_pernet_device(&kcm_net_ops); proto_unregister(&kcm_proto); destroy_workqueue(kcm_wq); diff --git a/net/key/af_key.c b/net/key/af_key.c index 5651c29cb5bd..4af1e1d60b9f 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1951,8 +1951,10 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; + if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) + return -EINVAL; - t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */ + t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index fed6becc5daf..e4dec03a19fe 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -169,8 +169,8 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (tunnel->tunnel_id == tunnel_id) { - l2tp_tunnel_inc_refcount(tunnel); + if (tunnel->tunnel_id == tunnel_id && + refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; @@ -190,8 +190,8 @@ struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) rcu_read_lock_bh(); list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { - if (++count > nth) { - l2tp_tunnel_inc_refcount(tunnel); + if (++count > nth && + refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } @@ -909,7 +909,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct l2tp_tunnel *tunnel; - tunnel = l2tp_tunnel(sk); + tunnel = rcu_dereference_sk_user_data(sk); if (tunnel == NULL) goto pass_up; @@ -1735,7 +1735,8 @@ static __net_exit void l2tp_exit_net(struct net *net) } rcu_read_unlock_bh(); - flush_workqueue(l2tp_wq); + if (l2tp_wq) + flush_workqueue(l2tp_wq); rcu_barrier(); for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index d4c60523c549..2cac910c1cd4 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -618,6 +618,7 @@ static const struct proto_ops l2tp_ip_ops = { .getname = l2tp_ip_getname, .poll = datagram_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 37a69df17cab..4ec546cc1dd6 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ 
-752,6 +752,7 @@ static const struct proto_ops l2tp_ip6_ops = { .getname = l2tp_ip6_getname, .poll = datagram_poll, .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index edbd5d1fbcde..6acc7f869b0c 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -345,7 +345,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; - nest = nla_nest_start(skb, L2TP_ATTR_STATS); + nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (nest == NULL) goto nla_put_failure; @@ -742,7 +742,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl session->reorder_timeout, L2TP_ATTR_PAD))) goto nla_put_failure; - nest = nla_nest_start(skb, L2TP_ATTR_STATS); + nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (nest == NULL) goto nla_put_failure; @@ -915,58 +915,58 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { static const struct genl_ops l2tp_nl_ops[] = { { .cmd = L2TP_CMD_NOOP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_noop, - .policy = l2tp_nl_policy, /* can be retrieved by unprivileged users */ }, { .cmd = L2TP_CMD_TUNNEL_CREATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_create, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_DELETE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_delete, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_MODIFY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_modify, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_get, .dumpit = l2tp_nl_cmd_tunnel_dump, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_CREATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_create, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_DELETE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_delete, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_MODIFY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_modify, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_get, .dumpit = l2tp_nl_cmd_session_dump, - .policy = l2tp_nl_policy, .flags = GENL_ADMIN_PERM, }, }; @@ -976,6 +976,7 @@ static struct genl_family l2tp_nl_family __ro_after_init = { .version = L2TP_GENL_VERSION, .hdrsize = 0, .maxattr = L2TP_ATTR_MAX, + .policy = l2tp_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = l2tp_nl_ops, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 04d9946dcdba..f36cae785e82 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1070,7 +1070,6 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, { struct pppol2tp_ioc_stats stats; struct l2tp_session *session; - int val; switch (cmd) { case 
PPPIOCGMRU: @@ -1097,7 +1096,7 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, if (!session->session_id && !session->peer_session_id) return -ENOSYS; - if (get_user(val, (int __user *)arg)) + if (!access_ok((int __user *)arg, sizeof(int))) return -EFAULT; break; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index b99e73a7e7e0..2017b7d780f5 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -320,14 +320,13 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) struct llc_sap *sap; int rc = -EINVAL; - dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); - lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (unlikely(addr->sllc_family != AF_LLC)) goto out; + dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 09dd1c2860fc..52e6a091b7e4 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -351,6 +351,36 @@ static int ieee80211_set_noack_map(struct wiphy *wiphy, return 0; } +static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata, + const u8 *mac_addr, u8 key_idx) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_key *key; + struct sta_info *sta; + int ret = -EINVAL; + + if (!wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) + return -EINVAL; + + sta = sta_info_get_bss(sdata, mac_addr); + + if (!sta) + return -EINVAL; + + if (sta->ptk_idx == key_idx) + return 0; + + mutex_lock(&local->key_mtx); + key = key_mtx_dereference(local, sta->ptk[key_idx]); + + if (key && key->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) + ret = ieee80211_set_tx_key(key); + + mutex_unlock(&local->key_mtx); + return ret; +} + static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) @@ -365,6 +395,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; + if (pairwise && params->mode == NL80211_KEY_SET_TX) + return ieee80211_set_tx(sdata, mac_addr, key_idx); + /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: @@ -396,6 +429,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, if (pairwise) key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; + if (params->mode == NL80211_KEY_NO_TX) + key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX; + mutex_lock(&local->sta_mtx); if (mac_addr) { @@ -1421,6 +1457,15 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; + if (params->sta_modify_mask & STATION_PARAM_APPLY_STA_TXPOWER) { + sta->sta.txpwr.type = params->txpwr.type; + if (params->txpwr.type == NL80211_TX_POWER_LIMITED) + sta->sta.txpwr.power = params->txpwr.power; + ret = drv_sta_set_txpwr(local, sdata, sta); + if (ret) + return ret; + } + if (params->supported_rates) { ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, sband, params->supported_rates, @@ -3990,4 +4035,5 @@ const struct cfg80211_ops mac80211_config_ops = { .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats, .start_pmsr = ieee80211_start_pmsr, .abort_pmsr = ieee80211_abort_pmsr, + .probe_mesh_link = ieee80211_probe_mesh_link, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 
2d43bc127043..0d462206eef6 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -150,6 +150,58 @@ static const struct file_operations aqm_ops = { .llseek = default_llseek, }; +static ssize_t force_tx_status_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[3]; + int len = 0; + + len = scnprintf(buf, sizeof(buf), "%d\n", (int)local->force_tx_status); + + return simple_read_from_buffer(user_buf, count, ppos, + buf, len); +} + +static ssize_t force_tx_status_write(struct file *file, + const char __user *user_buf, + size_t count, + loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[3]; + size_t len; + + if (count > sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + buf[sizeof(buf) - 1] = '\0'; + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') + buf[len - 1] = 0; + + if (buf[0] == '0' && buf[1] == '\0') + local->force_tx_status = 0; + else if (buf[0] == '1' && buf[1] == '\0') + local->force_tx_status = 1; + else + return -EINVAL; + + return count; +} + +static const struct file_operations force_tx_status_ops = { + .write = force_tx_status_write, + .read = force_tx_status_read, + .open = simple_open, + .llseek = default_llseek, +}; + #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) @@ -221,6 +273,7 @@ static const char *hw_flag_names[] = { FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), + FLAG(EXT_KEY_ID_NATIVE), #undef FLAG }; @@ -382,6 +435,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); + DEBUGFS_ADD_MODE(force_tx_status, 0600); if (local->ops->wake_tx_queue) DEBUGFS_ADD_MODE(aqm, 0600); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index cff0fb3578c9..deb3faf08337 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -841,7 +841,7 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) dir = sdata->vif.debugfs_dir; - if (!dir) + if (IS_ERR_OR_NULL(dir)) return; sprintf(buf, "netdev:%s", sdata->name); diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index bb886e7db47f..839c0022a29c 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -138,6 +138,27 @@ int drv_sta_state(struct ieee80211_local *local, return ret; } +__must_check +int drv_sta_set_txpwr(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + int ret = -EOPNOTSUPP; + + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_sta_set_txpwr(local, sdata, &sta->sta); + if (local->ops->sta_set_txpwr) + ret = local->ops->sta_set_txpwr(&local->hw, &sdata->vif, + &sta->sta); + trace_drv_return_int(local, ret); + return ret; +} + void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 28d022a3eee3..c2d8b5451a5e 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -529,6 +529,11 @@ int drv_sta_state(struct ieee80211_local *local, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state); +__must_check +int drv_sta_set_txpwr(struct ieee80211_local *local, + 
struct ieee80211_sub_if_data *sdata, + struct sta_info *sta); + void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed); @@ -1195,6 +1200,9 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); + if (local->in_reconfig) + return; + if (!check_sdata_in_driver(sdata)) return; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index e03c46ac8e4d..c62101857b9b 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -112,8 +112,9 @@ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, IEEE80211_HT_CAP_TX_STBC); /* Allow user to configure RX STBC bits */ - if (ht_capa_mask->cap_info & IEEE80211_HT_CAP_RX_STBC) - ht_cap->cap |= ht_capa->cap_info & IEEE80211_HT_CAP_RX_STBC; + if (ht_capa_mask->cap_info & cpu_to_le16(IEEE80211_HT_CAP_RX_STBC)) + ht_cap->cap |= le16_to_cpu(ht_capa->cap_info) & + IEEE80211_HT_CAP_RX_STBC; /* Allow user to decrease AMPDU factor */ if (ht_capa_mask->ampdu_params_info & diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e170f986d226..073a8235ae1b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1269,7 +1269,7 @@ struct ieee80211_local { /* * Key mutex, protects sdata's key_list and sta_info's - * key pointers (write access, they're RCU.) + * key pointers and ptk_idx (write access, they're RCU.) */ struct mutex key_mtx; @@ -1384,6 +1384,7 @@ struct ieee80211_local { struct dentry *rcdir; struct dentry *keys; } debugfs; + bool force_tx_status; #endif /* @@ -1505,7 +1506,6 @@ struct ieee802_11_elems { const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; const struct ieee80211_multiple_bssid_configuration *mbssid_config_ie; const struct ieee80211_bssid_index *bssid_index; - const u8 *nontransmitted_bssid_profile; u8 max_bssid_indicator; u8 dtim_count; u8 dtim_period; @@ -1761,7 +1761,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, - u32 info_flags); + u32 info_flags, + u32 ctrl_flags); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); struct sk_buff * @@ -1778,6 +1779,8 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, const u8 *dest, __be16 proto, bool unencrypted); +int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 4a6ff1482a9f..410685d38c46 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1133,8 +1133,7 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) + struct net_device *sb_dev) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } @@ -1179,8 +1178,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) + struct net_device *sb_dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local 
*local = sdata->local; @@ -1227,6 +1225,7 @@ static void ieee80211_if_setup(struct net_device *dev) static void ieee80211_if_setup_no_queue(struct net_device *dev) { ieee80211_if_setup(dev); + dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; } @@ -1764,13 +1763,13 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; - if (local->ops->wake_tx_queue) + if (local->ops->wake_tx_queue) { if_setup = ieee80211_if_setup_no_queue; - else + } else { if_setup = ieee80211_if_setup; - - if (local->hw.queues >= IEEE80211_NUM_ACS) - txqs = IEEE80211_NUM_ACS; + if (local->hw.queues >= IEEE80211_NUM_ACS) + txqs = IEEE80211_NUM_ACS; + } ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, @@ -1908,6 +1907,9 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata) list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); + if (sdata->vif.txq) + ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq)); + synchronize_rcu(); if (sdata->dev) { diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 4700718e010f..20bf9db7a388 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -140,6 +140,12 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) * so clear that flag now to avoid trying to remove * it again later. */ + if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && + !(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE | + IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) + increment_tailroom_need_count(sdata); + key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; return -EINVAL; } @@ -167,8 +173,10 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) * The driver doesn't know anything about VLAN interfaces. * Hence, don't send GTKs for VLAN interfaces to the driver. 
*/ - if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) + if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { + ret = 1; goto out_unsupported; + } } ret = drv_set_key(key->local, SET_KEY, sdata, @@ -177,9 +185,9 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | - IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) || - (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) + if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE | + IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && @@ -213,11 +221,8 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) { - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - return 0; + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; - } return 0; default: return -EINVAL; @@ -243,9 +248,9 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta = key->sta; sdata = key->sdata; - if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | - IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) || - (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) + if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE | + IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; @@ -259,9 +264,24 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta ? sta->sta.addr : bcast_addr, ret); } +int ieee80211_set_tx_key(struct ieee80211_key *key) +{ + struct sta_info *sta = key->sta; + struct ieee80211_local *local = key->local; + struct ieee80211_key *old; + + assert_key_lock(local); + + old = key_mtx_dereference(local, sta->ptk[sta->ptk_idx]); + sta->ptk_idx = key->conf.keyidx; + ieee80211_check_fast_xmit(sta); + + return 0; +} + static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, struct ieee80211_key *new_key, - bool ptk0rekey) + bool pairwise) { struct ieee80211_sub_if_data *sdata; struct ieee80211_local *local; @@ -278,8 +298,9 @@ static int ieee80211_hw_key_replace(struct ieee80211_key *old_key, assert_key_lock(old_key->local); sta = old_key->sta; - /* PTK only using key ID 0 needs special handling on rekey */ - if (new_key && sta && ptk0rekey) { + /* Unicast rekey without Extended Key ID needs special handling */ + if (new_key && sta && pairwise && + rcu_access_pointer(sta->ptk[sta->ptk_idx]) == old_key) { local = old_key->local; sdata = old_key->sdata; @@ -395,10 +416,6 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (old) { idx = old->conf.keyidx; - /* TODO: proper implement and test "Extended Key ID for - * Individually Addressed Frames" from IEEE 802.11-2016. 
- * Till then always assume only key ID 0 is used for - * pairwise keys.*/ ret = ieee80211_hw_key_replace(old, new, pairwise); } else { /* new must be provided in case old is not */ idx = new->conf.keyidx; } ret = 0; if (sta) { if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); - sta->ptk_idx = idx; - if (new) { + if (new && + !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) { + sta->ptk_idx = idx; clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); } } else { rcu_assign_pointer(sta->gtk[idx], new); } - if (new) + /* Only needed for transition from no key -> key. + * Still triggers unnecessarily when using Extended Key ID + * and installing the second key ID the first time. + */ + if (new && !old) ieee80211_check_fast_rx(sta); } else { defunikey = old && @@ -739,16 +761,34 @@ int ieee80211_key_link(struct ieee80211_key *key, * can cause warnings to appear. */ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; - int ret; + int ret = -EOPNOTSUPP; mutex_lock(&sdata->local->key_mtx); - if (sta && pairwise) + if (sta && pairwise) { + struct ieee80211_key *alt_key; + old_key = key_mtx_dereference(sdata->local, sta->ptk[idx]); - else if (sta) + alt_key = key_mtx_dereference(sdata->local, sta->ptk[idx ^ 1]); + + /* The rekey code assumes that the old and new key are using + * the same cipher. Enforce the assumption for pairwise keys. + */ + if (key && + ((alt_key && alt_key->conf.cipher != key->conf.cipher) || + (old_key && old_key->conf.cipher != key->conf.cipher))) + goto out; + } else if (sta) { old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]); - else + } else { old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]); + } + + /* Non-pairwise keys must also not switch the cipher on rekey */ + if (!pairwise) { + if (key && old_key && old_key->conf.cipher != key->conf.cipher) + goto out; + } /* * Silently accept key re-installation without really installing the @@ -1188,9 +1228,9 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; - if (!((key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | - IEEE80211_KEY_FLAG_PUT_MIC_SPACE)) || - (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) + if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE | + IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(key->sdata); } diff --git a/net/mac80211/key.h b/net/mac80211/key.h index ebdb80b85dc3..f06fbd03d235 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -18,6 +18,7 @@ #define NUM_DEFAULT_KEYS 4 #define NUM_DEFAULT_MGMT_KEYS 2 +#define INVALID_PTK_KEYIDX 2 /* Keyidx always pointing to a NULL key for PTK */ struct ieee80211_local; struct ieee80211_sub_if_data; @@ -146,6 +147,7 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, int ieee80211_key_link(struct ieee80211_key *key, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); +int ieee80211_set_tx_key(struct ieee80211_key *key); void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom); void ieee80211_key_free_unused(struct ieee80211_key *key); void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 800e67615e2a..2b608044ae23 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1051,6 +1051,22 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) }
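/* Editor's sketch for the ieee80211_key_link() hunk above: a pairwise
 * install is rejected when either PTK slot (idx or idx ^ 1) already holds a
 * key with a different cipher, because the rekey path assumes old and new
 * keys share one cipher. A standalone model of the check (field and type
 * names are illustrative):
 */
#include <stdbool.h>
#include <stdint.h>

struct model_key { uint32_t cipher; };

static bool pairwise_cipher_ok(const struct model_key *new_key,
			       const struct model_key *old_key,
			       const struct model_key *alt_key)
{
	if (!new_key)
		return true;
	if (alt_key && alt_key->cipher != new_key->cipher)
		return false;	/* other key ID slot disagrees */
	if (old_key && old_key->cipher != new_key->cipher)
		return false;	/* slot being replaced disagrees */
	return true;
}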
} + /* Enable Extended Key IDs when the driver allows it, or when it + * supports neither HW crypto nor A-MPDUs + */ + if ((!local->ops->set_key && + !ieee80211_hw_check(hw, AMPDU_AGGREGATION)) || + ieee80211_hw_check(&local->hw, EXT_KEY_ID_NATIVE)) + wiphy_ext_feature_set(local->hw.wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID); + + /* Mac80211, and therefore all cards using only SW crypto, are able to + * handle PTK rekeys correctly + */ + if (!local->ops->set_key) + wiphy_ext_feature_set(local->hw.wiphy, + NL80211_EXT_FEATURE_CAN_REPLACE_PTK0); + /* * Calculate scan IE length -- we need this to alloc * memory and to subtract from the driver limit. It diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 574c3891c4b2..88535a2e62bc 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -278,6 +278,8 @@ mesh_path_add(struct ieee80211_sub_if_data *sdata, const u8 *dst); int mesh_path_add_gate(struct mesh_path *mpath); int mesh_path_send_to_gates(struct mesh_path *mpath); int mesh_gate_num(struct ieee80211_sub_if_data *sdata); +u32 airtime_link_metric_get(struct ieee80211_local *local, + struct sta_info *sta); /* Mesh plinks */ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index f7517668e77a..bf8e13cd5fd1 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -318,8 +318,8 @@ void ieee80211s_update_metric(struct ieee80211_local *local, cfg80211_calculate_bitrate(&rinfo)); } -static u32 airtime_link_metric_get(struct ieee80211_local *local, - struct sta_info *sta) +u32 airtime_link_metric_get(struct ieee80211_local *local, + struct sta_info *sta) { /* This should be adjusted for each device */ int device_constant = 1 << ARITH_SHIFT; @@ -1130,16 +1130,17 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, struct mesh_path *mpath; struct sk_buff *skb_to_free = NULL; u8 *target_addr = hdr->addr3; - int err = 0; /* Nulls are only sent to peers for PS and should be pre-addressed */ if (ieee80211_is_qos_nullfunc(hdr->frame_control)) return 0; - rcu_read_lock(); - err = mesh_nexthop_lookup(sdata, skb); - if (!err) - goto endlookup; + /* Allow injected packets to bypass mesh routing */ + if (info->control.flags & IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP) + return 0; + + if (!mesh_nexthop_lookup(sdata, skb)) + return 0; /* no nexthop found, start resolving */ mpath = mesh_path_lookup(sdata, target_addr); @@ -1147,8 +1148,7 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, mpath = mesh_path_add(sdata, target_addr); if (IS_ERR(mpath)) { mesh_path_discard_frame(sdata, skb); - err = PTR_ERR(mpath); - goto endlookup; + return PTR_ERR(mpath); } } @@ -1161,13 +1161,10 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; ieee80211_set_qos_hdr(sdata, skb); skb_queue_tail(&mpath->frame_queue, skb); - err = -ENOENT; if (skb_to_free) mesh_path_discard_frame(sdata, skb_to_free); -endlookup: - rcu_read_unlock(); - return err; + return -ENOENT; } /** @@ -1187,13 +1184,10 @@ int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, struct sta_info *next_hop; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u8 *target_addr = hdr->addr3; - int err = -ENOENT; - rcu_read_lock(); mpath = mesh_path_lookup(sdata, target_addr); - if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) - goto endlookup; + return -ENOENT; if (time_after(jiffies, mpath->exp_time - @@ -1208,12 +1202,10 @@ int mesh_nexthop_lookup(struct
ieee80211_sub_if_data *sdata, memcpy(hdr->addr1, next_hop->sta.addr, ETH_ALEN); memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); ieee80211_mps_set_frame_flags(sdata, next_hop, hdr); - err = 0; + return 0; } -endlookup: - rcu_read_unlock(); - return err; + return -ENOENT; } void mesh_path_timer(struct timer_list *t) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 95eb5064fa91..796b069ad251 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -23,7 +23,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath); static u32 mesh_table_hash(const void *addr, u32 len, u32 seed) { /* Use last four bytes of hw addr as hash index */ - return jhash_1word(*(u32 *)(addr+2), seed); + return jhash_1word(__get_unaligned_cpu32((u8 *)addr + 2), seed); } static const struct rhashtable_params mesh_rht_params = { @@ -217,7 +217,7 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst, { struct mesh_path *mpath; - mpath = rhashtable_lookup_fast(&tbl->rhead, dst, mesh_rht_params); + mpath = rhashtable_lookup(&tbl->rhead, dst, mesh_rht_params); if (mpath && mpath_expired(mpath)) { spin_lock_bh(&mpath->state_lock); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2dbcf5d5512e..b7a9fe3d5fcb 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1188,9 +1188,6 @@ static void ieee80211_chswitch_work(struct work_struct *work) goto out; } - /* XXX: shouldn't really modify cfg80211-owned data! */ - ifmgd->associated->channel = sdata->csa_chandef.chan; - ifmgd->csa_waiting_bcn = true; ieee80211_sta_reset_beacon_monitor(sdata); diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index ccaf951e4e31..8b168724c5e7 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -51,8 +51,13 @@ MINSTREL_MAX_STREAMS * _sgi + \ _streams - 1 +#define _MAX(a, b) (((a)>(b))?(a):(b)) + +#define GROUP_SHIFT(duration) \ + _MAX(0, 16 - __builtin_clz(duration)) + /* MCS rate information for an MCS group */ -#define MCS_GROUP(_streams, _sgi, _ht40, _s) \ +#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ [GROUP_IDX(_streams, _sgi, _ht40)] = { \ .streams = _streams, \ .shift = _s, \ @@ -72,6 +77,13 @@ } \ } +#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) + +#define MCS_GROUP(_streams, _sgi, _ht40) \ + __MCS_GROUP(_streams, _sgi, _ht40, \ + MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) + #define VHT_GROUP_IDX(_streams, _sgi, _bw) \ (MINSTREL_VHT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * (_bw) + \ @@ -81,7 +93,7 @@ #define BW2VBPS(_bw, r3, r2, r1) \ (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) -#define VHT_GROUP(_streams, _sgi, _bw, _s) \ +#define __VHT_GROUP(_streams, _sgi, _bw, _s) \ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ .streams = _streams, \ .shift = _s, \ @@ -114,6 +126,14 @@ } \ } +#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ + GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ + BW2VBPS(_bw, 117, 54, 26))) + +#define VHT_GROUP(_streams, _sgi, _bw) \ + __VHT_GROUP(_streams, _sgi, _bw, \ + VHT_GROUP_SHIFT(_streams, _sgi, _bw)) + #define CCK_DURATION(_bitrate, _short, _len) \ (1000 * (10 /* SIFS */ + \ (_short ? 
72 + 24 : 144 + 48) + \ @@ -129,7 +149,7 @@ CCK_ACK_DURATION(55, _short) >> _s, \ CCK_ACK_DURATION(110, _short) >> _s -#define CCK_GROUP(_s) \ +#define __CCK_GROUP(_s) \ [MINSTREL_CCK_GROUP] = { \ .streams = 1, \ .flags = 0, \ @@ -140,6 +160,12 @@ } \ } +#define CCK_GROUP_SHIFT \ + GROUP_SHIFT(CCK_ACK_DURATION(10, false)) + +#define CCK_GROUP __CCK_GROUP(CCK_GROUP_SHIFT) + + static bool minstrel_vht_only = true; module_param(minstrel_vht_only, bool, 0644); MODULE_PARM_DESC(minstrel_vht_only, @@ -154,47 +180,57 @@ MODULE_PARM_DESC(minstrel_vht_only, * BW -> SGI -> #streams */ const struct mcs_group minstrel_mcs_groups[] = { - MCS_GROUP(1, 0, BW_20, 5), - MCS_GROUP(2, 0, BW_20, 4), - MCS_GROUP(3, 0, BW_20, 4), - - MCS_GROUP(1, 1, BW_20, 5), - MCS_GROUP(2, 1, BW_20, 4), - MCS_GROUP(3, 1, BW_20, 4), - - MCS_GROUP(1, 0, BW_40, 4), - MCS_GROUP(2, 0, BW_40, 4), - MCS_GROUP(3, 0, BW_40, 4), - - MCS_GROUP(1, 1, BW_40, 4), - MCS_GROUP(2, 1, BW_40, 4), - MCS_GROUP(3, 1, BW_40, 4), - - CCK_GROUP(8), - - VHT_GROUP(1, 0, BW_20, 5), - VHT_GROUP(2, 0, BW_20, 4), - VHT_GROUP(3, 0, BW_20, 4), - - VHT_GROUP(1, 1, BW_20, 5), - VHT_GROUP(2, 1, BW_20, 4), - VHT_GROUP(3, 1, BW_20, 4), - - VHT_GROUP(1, 0, BW_40, 4), - VHT_GROUP(2, 0, BW_40, 4), - VHT_GROUP(3, 0, BW_40, 4), - - VHT_GROUP(1, 1, BW_40, 4), - VHT_GROUP(2, 1, BW_40, 4), - VHT_GROUP(3, 1, BW_40, 4), - - VHT_GROUP(1, 0, BW_80, 4), - VHT_GROUP(2, 0, BW_80, 4), - VHT_GROUP(3, 0, BW_80, 4), - - VHT_GROUP(1, 1, BW_80, 4), - VHT_GROUP(2, 1, BW_80, 4), - VHT_GROUP(3, 1, BW_80, 4), + MCS_GROUP(1, 0, BW_20), + MCS_GROUP(2, 0, BW_20), + MCS_GROUP(3, 0, BW_20), + MCS_GROUP(4, 0, BW_20), + + MCS_GROUP(1, 1, BW_20), + MCS_GROUP(2, 1, BW_20), + MCS_GROUP(3, 1, BW_20), + MCS_GROUP(4, 1, BW_20), + + MCS_GROUP(1, 0, BW_40), + MCS_GROUP(2, 0, BW_40), + MCS_GROUP(3, 0, BW_40), + MCS_GROUP(4, 0, BW_40), + + MCS_GROUP(1, 1, BW_40), + MCS_GROUP(2, 1, BW_40), + MCS_GROUP(3, 1, BW_40), + MCS_GROUP(4, 1, BW_40), + + CCK_GROUP, + + VHT_GROUP(1, 0, BW_20), + VHT_GROUP(2, 0, BW_20), + VHT_GROUP(3, 0, BW_20), + VHT_GROUP(4, 0, BW_20), + + VHT_GROUP(1, 1, BW_20), + VHT_GROUP(2, 1, BW_20), + VHT_GROUP(3, 1, BW_20), + VHT_GROUP(4, 1, BW_20), + + VHT_GROUP(1, 0, BW_40), + VHT_GROUP(2, 0, BW_40), + VHT_GROUP(3, 0, BW_40), + VHT_GROUP(4, 0, BW_40), + + VHT_GROUP(1, 1, BW_40), + VHT_GROUP(2, 1, BW_40), + VHT_GROUP(3, 1, BW_40), + VHT_GROUP(4, 1, BW_40), + + VHT_GROUP(1, 0, BW_80), + VHT_GROUP(2, 0, BW_80), + VHT_GROUP(3, 0, BW_80), + VHT_GROUP(4, 0, BW_80), + + VHT_GROUP(1, 1, BW_80), + VHT_GROUP(2, 1, BW_80), + VHT_GROUP(3, 1, BW_80), + VHT_GROUP(4, 1, BW_80), }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 26b7a3244b47..f762e5ba7c2e 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -13,7 +13,7 @@ * The number of streams can be changed to 2 to reduce code * size and memory footprint. 
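/* Editor's worked example: the GROUP_SHIFT() macro introduced above derives,
 * per rate group, the smallest right-shift that makes the group's longest
 * frame duration fit in 16 bits -- replacing the hand-maintained shift
 * constants the old MCS_GROUP(..., _s) table carried. A standalone
 * demonstration (MY_* names are stand-ins for the patch's macros):
 */
#include <assert.h>
#include <stdio.h>

#define MY_MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MY_GROUP_SHIFT(duration) MY_MAX(0, 16 - __builtin_clz(duration))

int main(void)
{
	unsigned int duration = 548000;	/* a slow-rate airtime, e.g. in ns */
	int shift = MY_GROUP_SHIFT(duration);

	/* 548000 needs 20 bits, so the shift comes out as 20 - 16 = 4 */
	assert(shift == 4);
	/* after shifting, any duration up to this one fits in 16 bits */
	assert((duration >> shift) < (1u << 16));
	printf("duration %u -> shift %d -> stored as %u\n",
	       duration, shift, duration >> shift);
	return 0;
}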
*/ -#define MINSTREL_MAX_STREAMS 3 +#define MINSTREL_MAX_STREAMS 4 #define MINSTREL_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */ #define MINSTREL_VHT_STREAM_GROUPS 6 /* BW(=3) * SGI(=2) */ diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7f8d93401ce0..25577ede2986 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1005,23 +1005,43 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) return -1; } -static int ieee80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs, - struct sk_buff *skb) +static int ieee80211_get_keyid(struct sk_buff *skb, + const struct ieee80211_cipher_scheme *cs) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc; int hdrlen; + int minlen; + u8 key_idx_off; + u8 key_idx_shift; u8 keyid; fc = hdr->frame_control; hdrlen = ieee80211_hdrlen(fc); - if (skb->len < hdrlen + cs->hdr_len) + if (cs) { + minlen = hdrlen + cs->hdr_len; + key_idx_off = hdrlen + cs->key_idx_off; + key_idx_shift = cs->key_idx_shift; + } else { + /* WEP, TKIP, CCMP and GCMP */ + minlen = hdrlen + IEEE80211_WEP_IV_LEN; + key_idx_off = hdrlen + 3; + key_idx_shift = 6; + } + + if (unlikely(skb->len < minlen)) return -EINVAL; - skb_copy_bits(skb, hdrlen + cs->key_idx_off, &keyid, 1); - keyid &= cs->key_idx_mask; - keyid >>= cs->key_idx_shift; + skb_copy_bits(skb, key_idx_off, &keyid, 1); + + if (cs) + keyid &= cs->key_idx_mask; + keyid >>= key_idx_shift; + + /* cs could use more than the usual two bits for the keyid */ + if (unlikely(keyid >= NUM_DEFAULT_KEYS)) + return -EINVAL; return keyid; } @@ -1568,7 +1588,15 @@ static void sta_ps_start(struct sta_info *sta) return; for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { - if (txq_has_queue(sta->sta.txq[tid])) + struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct txq_info *txqi = to_txq_info(txq); + + spin_lock(&local->active_txq_lock[txq->ac]); + if (!list_empty(&txqi->schedule_order)) + list_del_init(&txqi->schedule_order); + spin_unlock(&local->active_txq_lock[txq->ac]); + + if (txq_has_queue(txq)) set_bit(tid, &sta->txq_buffered_tids); else clear_bit(tid, &sta->txq_buffered_tids); @@ -1852,9 +1880,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; int keyidx; - int hdrlen; ieee80211_rx_result result = RX_DROP_UNUSABLE; struct ieee80211_key *sta_ptk = NULL; + struct ieee80211_key *ptk_idx = NULL; int mmie_keyidx = -1; __le16 fc; const struct ieee80211_cipher_scheme *cs = NULL; @@ -1892,21 +1920,24 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->sta) { int keyid = rx->sta->ptk_idx; + sta_ptk = rcu_dereference(rx->sta->ptk[keyid]); - if (ieee80211_has_protected(fc) && rx->sta->cipher_scheme) { + if (ieee80211_has_protected(fc)) { cs = rx->sta->cipher_scheme; - keyid = ieee80211_get_cs_keyid(cs, rx->skb); + keyid = ieee80211_get_keyid(rx->skb, cs); + if (unlikely(keyid < 0)) return RX_DROP_UNUSABLE; + + ptk_idx = rcu_dereference(rx->sta->ptk[keyid]); } - sta_ptk = rcu_dereference(rx->sta->ptk[keyid]); } if (!ieee80211_has_protected(fc)) mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb); if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) { - rx->key = sta_ptk; + rx->key = ptk_idx ? 
ptk_idx : sta_ptk; if ((status->flag & RX_FLAG_DECRYPTED) && (status->flag & RX_FLAG_IV_STRIPPED)) return RX_CONTINUE; @@ -1966,8 +1997,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) } return RX_CONTINUE; } else { - u8 keyid; - /* * The device doesn't give us the IV so we won't be * able to look up the key. That's ok though, we @@ -1981,23 +2010,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) (status->flag & RX_FLAG_IV_STRIPPED)) return RX_CONTINUE; - hdrlen = ieee80211_hdrlen(fc); + keyidx = ieee80211_get_keyid(rx->skb, cs); - if (cs) { - keyidx = ieee80211_get_cs_keyid(cs, rx->skb); - - if (unlikely(keyidx < 0)) - return RX_DROP_UNUSABLE; - } else { - if (rx->skb->len < 8 + hdrlen) - return RX_DROP_UNUSABLE; /* TODO: count this? */ - /* - * no need to call ieee80211_wep_get_keyidx, - * it verifies a bunch of things we've done already - */ - skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1); - keyidx = keyid >> 6; - } + if (unlikely(keyidx < 0)) + return RX_DROP_UNUSABLE; /* check per-station GTK first, if multicast packet */ if (is_multicast_ether_addr(hdr->addr1) && rx->sta) @@ -4042,12 +4058,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta) case WLAN_CIPHER_SUITE_GCMP_256: break; default: - /* we also don't want to deal with WEP or cipher scheme - * since those require looking up the key idx in the - * frame, rather than assuming the PTK is used - * (we need to revisit this once we implement the real - * PTK index, which is now valid in the spec, but we - * haven't implemented that part yet) + /* We also don't want to deal with + * WEP or cipher scheme. */ goto clear_rcu; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 11f058987a54..a4932ee3595c 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -347,6 +347,15 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; + /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. + * The Tx path starts to use a key as soon as the key slot that + * ptk_idx references is not NULL. To avoid using the initial Rx-only + * key prematurely for Tx, initialize ptk_idx to an impossible PTK + * keyid which will always refer to a NULL key.
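/* Editor's sketch of the comment above: ptk_idx starts at
 * INVALID_PTK_KEYIDX (2) so that the Tx fast path, which keys off
 * sta->ptk[sta->ptk_idx] being non-NULL, can never pick up a key that was
 * installed Rx-only for an Extended Key ID rekey. A minimal model
 * (array size and names are illustrative):
 */
#include <stddef.h>

#define MODEL_INVALID_PTK_KEYIDX 2

struct model_sta_keys {
	const void *ptk[3];	/* keyid 0, keyid 1, and a spare slot */
	int ptk_idx;
};

static void model_sta_keys_init(struct model_sta_keys *sta)
{
	sta->ptk[0] = sta->ptk[1] = sta->ptk[2] = NULL;
	/* slot 2 is never populated for a PTK, so Tx stays keyless until
	 * ieee80211_set_tx_key() (or a regular install) moves ptk_idx
	 */
	sta->ptk_idx = MODEL_INVALID_PTK_KEYIDX;
}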
+ */ + BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); + sta->ptk_idx = INVALID_PTK_KEYIDX; + sta->local = local; sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; @@ -2373,6 +2382,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } + + if (ieee80211_vif_is_mesh(&sdata->vif)) { + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); + sinfo->airtime_link_metric = + airtime_link_metric_get(local, sta); + } } u32 sta_get_expected_throughput(struct sta_info *sta) diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index d30690d79a58..24c37f91ca46 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1056,7 +1056,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, /* disable bottom halves when entering the Tx path */ local_bh_disable(); - __ieee80211_subif_start_xmit(skb, dev, flags); + __ieee80211_subif_start_xmit(skb, dev, flags, 0); local_bh_enable(); return ret; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 8ba70d26b82e..3bb4459b52c7 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -828,6 +828,36 @@ TRACE_EVENT(drv_sta_state, ) ); +TRACE_EVENT(drv_sta_set_txpwr, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta), + + TP_ARGS(local, sdata, sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(s16, txpwr) + __field(u8, type) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_ASSIGN; + __entry->txpwr = sta->txpwr.power; + __entry->type = sta->txpwr.type; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " txpwr: %d type %d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, + __entry->txpwr, __entry->type + ) +); + TRACE_EVENT(drv_sta_rc_update, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h index 366b9e6f043e..40141df09f25 100644 --- a/net/mac80211/trace_msg.h +++ b/net/mac80211/trace_msg.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions of this file + * Copyright (C) 2019 Intel Corporation + */ + #ifdef CONFIG_MAC80211_MESSAGE_TRACING #if !defined(__MAC80211_MSG_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) @@ -11,7 +16,7 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM mac80211_msg -#define MAX_MSG_LEN 100 +#define MAX_MSG_LEN 120 DECLARE_EVENT_CLASS(mac80211_msg_event, TP_PROTO(struct va_format *vaf), diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8a49a74c0a37..dd220b977025 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1399,11 +1399,15 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; + u32 flow_idx = fq_flow_idx(fq, skb); ieee80211_set_skb_enqueue_time(skb); - fq_tin_enqueue(fq, tin, skb, + + spin_lock_bh(&fq->lock); + fq_tin_enqueue(fq, tin, flow_idx, skb, fq_skb_free_func, fq_flow_get_default_func); + spin_unlock_bh(&fq->lock); } static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, @@ -1590,7 +1594,6 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { - struct fq *fq = &local->fq; struct ieee80211_vif *vif; struct txq_info *txqi; @@ -1608,9 +1611,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, if (!txqi) return false; - spin_lock_bh(&fq->lock); ieee80211_txq_enqueue(local, txqi, skb); - spin_unlock_bh(&fq->lock); 
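/* Editor's note on the ieee80211_txq_enqueue() hunk above: the fq flow
 * index is now computed from the skb before fq->lock is taken and handed
 * to fq_tin_enqueue(), keeping the (lock-free) hashing work out of the
 * critical section. A generic model of the pattern, assuming pthreads:
 */
#include <pthread.h>

struct model_fq {
	pthread_mutex_t lock;
	/* flow state elided */
};

static unsigned int model_flow_idx(const void *pkt)
{
	/* stand-in for fq_flow_idx()'s skb hash; needs no lock */
	return (unsigned int)(unsigned long)pkt % 64u;
}

static void model_enqueue_locked(struct model_fq *fq,
				 unsigned int flow_idx, void *pkt)
{
	(void)fq; (void)flow_idx; (void)pkt;	/* bookkeeping elided */
}

static void model_enqueue(struct model_fq *fq, void *pkt)
{
	unsigned int flow_idx = model_flow_idx(pkt);	/* outside the lock */

	pthread_mutex_lock(&fq->lock);
	model_enqueue_locked(fq, flow_idx, pkt);
	pthread_mutex_unlock(&fq->lock);
}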
schedule_and_wake_txq(local, txqi); @@ -2431,6 +2432,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, * @sdata: virtual interface to build the header for * @skb: the skb to build the header in * @info_flags: skb flags to set + * @ctrl_flags: info control flags to set * * This function takes the skb with 802.3 header and reformats the header to * the appropriate IEEE 802.11 header based on which interface the packet is @@ -2446,7 +2448,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, - struct sta_info *sta) + struct sta_info *sta, u32 ctrl_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; @@ -2470,6 +2472,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, if (IS_ERR(sta)) sta = NULL; +#ifdef CONFIG_MAC80211_DEBUGFS + if (local->force_tx_status) + info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; +#endif + /* convert Ethernet header to proper 802.11 header (based on * operation mode) */ ethertype = (skb->data[12] << 8) | skb->data[13]; @@ -2600,6 +2607,13 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } band = chanctx_conf->def.chan->band; + + /* For injected frames, fill RA right away as nexthop lookup + * will be skipped. + */ + if ((ctrl_flags & IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP) && + is_zero_ether_addr(hdr.addr1)) + memcpy(hdr.addr1, skb->data, ETH_ALEN); break; #endif case NL80211_IFTYPE_STATION: @@ -2818,6 +2832,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, info->flags = info_flags; info->ack_frame_id = info_id; info->band = band; + info->control.flags = ctrl_flags; return skb; free: @@ -3000,23 +3015,15 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) switch (build.key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: - /* add fixed key ID */ - if (gen_iv) { - (build.hdr + build.hdr_len)[3] = - 0x20 | (build.key->conf.keyidx << 6); + if (gen_iv) build.pn_offs = build.hdr_len; - } if (gen_iv || iv_spc) build.hdr_len += IEEE80211_CCMP_HDR_LEN; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: - /* add fixed key ID */ - if (gen_iv) { - (build.hdr + build.hdr_len)[3] = - 0x20 | (build.key->conf.keyidx << 6); + if (gen_iv) build.pn_offs = build.hdr_len; - } if (gen_iv || iv_spc) build.hdr_len += IEEE80211_GCMP_HDR_LEN; break; @@ -3221,6 +3228,8 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, u8 max_subframes = sta->sta.max_amsdu_subframes; int max_frags = local->hw.max_tx_fragments; int max_amsdu_len = sta->sta.max_amsdu_len; + int orig_truesize; + u32 flow_idx; __be16 len; void *data; bool ret = false; @@ -3249,6 +3258,8 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, max_amsdu_len = min_t(int, max_amsdu_len, sta->sta.max_tid_amsdu_len[tid]); + flow_idx = fq_flow_idx(fq, skb); + spin_lock_bh(&fq->lock); /* TODO: Ideally aggregation should be done on dequeue to remain @@ -3256,11 +3267,13 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, */ tin = &txqi->tin; - flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func); + flow = fq_flow_classify(fq, tin, flow_idx, skb, + fq_flow_get_default_func); head = skb_peek_tail(&flow->queue); if (!head || skb_is_gso(head)) goto out; + orig_truesize = head->truesize; orig_len = 
head->len; if (skb->len + head->len > max_amsdu_len) @@ -3318,6 +3331,7 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, *frag_tail = skb; out_recalc: + fq->memory_usage += head->truesize - orig_truesize; if (head->len != orig_len) { flow->backlog += head->len - orig_len; tin->backlog_bytes += head->len - orig_len; @@ -3383,6 +3397,7 @@ static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, pn = atomic64_inc_return(&key->conf.tx_pn); crypto_hdr[0] = pn; crypto_hdr[1] = pn >> 8; + crypto_hdr[3] = 0x20 | (key->conf.keyidx << 6); crypto_hdr[4] = pn >> 16; crypto_hdr[5] = pn >> 24; crypto_hdr[6] = pn >> 32; @@ -3475,6 +3490,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT; +#ifdef CONFIG_MAC80211_DEBUGFS + if (local->force_tx_status) + info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; +#endif + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; *ieee80211_get_qos_ctl(hdr) = tid; @@ -3530,6 +3550,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; +begin: spin_lock_bh(&fq->lock); if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) || @@ -3546,11 +3567,12 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, if (skb) goto out; -begin: skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); if (!skb) goto out; + spin_unlock_bh(&fq->lock); + hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3595,8 +3617,11 @@ begin: skb = __skb_dequeue(&tx.skbs); - if (!skb_queue_empty(&tx.skbs)) + if (!skb_queue_empty(&tx.skbs)) { + spin_lock_bh(&fq->lock); skb_queue_splice_tail(&tx.skbs, &txqi->frags); + spin_unlock_bh(&fq->lock); + } } if (skb_has_frag_list(skb) && @@ -3635,6 +3660,7 @@ begin: } IEEE80211_SKB_CB(skb)->control.vif = vif; + return skb; out: spin_unlock_bh(&fq->lock); @@ -3646,16 +3672,17 @@ EXPORT_SYMBOL(ieee80211_tx_dequeue); struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) { struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_txq *ret = NULL; struct txq_info *txqi = NULL; - lockdep_assert_held(&local->active_txq_lock[ac]); + spin_lock_bh(&local->active_txq_lock[ac]); begin: txqi = list_first_entry_or_null(&local->active_txqs[ac], struct txq_info, schedule_order); if (!txqi) - return NULL; + goto out; if (txqi->txq.sta) { struct sta_info *sta = container_of(txqi->txq.sta, @@ -3672,24 +3699,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac) if (txqi->schedule_round == local->schedule_round[ac]) - return NULL; + goto out; list_del_init(&txqi->schedule_order); txqi->schedule_round = local->schedule_round[ac]; - return &txqi->txq; + ret = &txqi->txq; + +out: + spin_unlock_bh(&local->active_txq_lock[ac]); + return ret; } EXPORT_SYMBOL(ieee80211_next_txq); -void ieee80211_return_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq) +void __ieee80211_schedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq, + bool force) { struct ieee80211_local *local = hw_to_local(hw); struct txq_info *txqi = to_txq_info(txq); - lockdep_assert_held(&local->active_txq_lock[txq->ac]); + spin_lock_bh(&local->active_txq_lock[txq->ac]); if (list_empty(&txqi->schedule_order) && - (!skb_queue_empty(&txqi->frags) || txqi->tin.backlog_packets)) { + (force || !skb_queue_empty(&txqi->frags) || + txqi->tin.backlog_packets)) 
{ /* If airtime accounting is active, always enqueue STAs at the * head of the list to ensure that they only get moved to the * back by the airtime DRR scheduler once they have a negative @@ -3706,20 +3739,10 @@ void ieee80211_return_txq(struct ieee80211_hw *hw, list_add_tail(&txqi->schedule_order, &local->active_txqs[txq->ac]); } -} -EXPORT_SYMBOL(ieee80211_return_txq); -void ieee80211_schedule_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq) - __acquires(txq_lock) __releases(txq_lock) -{ - struct ieee80211_local *local = hw_to_local(hw); - - spin_lock_bh(&local->active_txq_lock[txq->ac]); - ieee80211_return_txq(hw, txq); spin_unlock_bh(&local->active_txq_lock[txq->ac]); } -EXPORT_SYMBOL(ieee80211_schedule_txq); +EXPORT_SYMBOL(__ieee80211_schedule_txq); bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct ieee80211_txq *txq) @@ -3729,7 +3752,7 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, struct sta_info *sta; u8 ac = txq->ac; - lockdep_assert_held(&local->active_txq_lock[ac]); + spin_lock_bh(&local->active_txq_lock[ac]); if (!txqi->txq.sta) goto out; @@ -3759,40 +3782,35 @@ bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw, sta->airtime[ac].deficit += sta->airtime_weight; list_move_tail(&txqi->schedule_order, &local->active_txqs[ac]); + spin_unlock_bh(&local->active_txq_lock[ac]); return false; out: if (!list_empty(&txqi->schedule_order)) list_del_init(&txqi->schedule_order); + spin_unlock_bh(&local->active_txq_lock[ac]); return true; } EXPORT_SYMBOL(ieee80211_txq_may_transmit); void ieee80211_txq_schedule_start(struct ieee80211_hw *hw, u8 ac) - __acquires(txq_lock) { struct ieee80211_local *local = hw_to_local(hw); spin_lock_bh(&local->active_txq_lock[ac]); local->schedule_round[ac]++; -} -EXPORT_SYMBOL(ieee80211_txq_schedule_start); - -void ieee80211_txq_schedule_end(struct ieee80211_hw *hw, u8 ac) - __releases(txq_lock) -{ - struct ieee80211_local *local = hw_to_local(hw); - spin_unlock_bh(&local->active_txq_lock[ac]); } -EXPORT_SYMBOL(ieee80211_txq_schedule_end); +EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, - u32 info_flags) + u32 info_flags, + u32 ctrl_flags) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *next; @@ -3806,7 +3824,15 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) goto out_free; - if (!IS_ERR_OR_NULL(sta)) { + if (IS_ERR(sta)) + sta = NULL; + + if (local->ops->wake_tx_queue) { + u16 queue = __ieee80211_select_queue(sdata, sta, skb); + skb_set_queue_mapping(skb, queue); + } + + if (sta) { struct ieee80211_fast_tx *fast_tx; sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); @@ -3855,7 +3881,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb->prev = NULL; skb->next = NULL; - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); + skb = ieee80211_build_hdr(sdata, skb, info_flags, + sta, ctrl_flags); if (IS_ERR(skb)) goto out; @@ -3995,9 +4022,9 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) - __ieee80211_subif_start_xmit(skb, dev, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0); } else { - __ieee80211_subif_start_xmit(skb, dev, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0); } return NETDEV_TX_OK; @@ -4022,7 +4049,7 @@ 
ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, goto out; } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta); + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0); if (IS_ERR(skb)) goto out; @@ -5059,7 +5086,36 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_mac_header(skb); local_bh_disable(); - __ieee80211_subif_start_xmit(skb, skb->dev, flags); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, 0); + local_bh_enable(); + + return 0; +} + +int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + len + + 30 + /* header size */ + 18); /* 11s header size */ + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom); + skb_put_data(skb, buf, len); + + skb->dev = dev; + skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + + local_bh_disable(); + __ieee80211_subif_start_xmit(skb, skb->dev, 0, + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP); local_bh_enable(); return 0; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 4c1655972565..cba4633cd6cf 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -894,10 +894,10 @@ EXPORT_SYMBOL(ieee80211_queue_delayed_work); static u32 _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, struct ieee802_11_elems *elems, - u64 filter, u32 crc, u8 *transmitter_bssid, - u8 *bss_bssid) + u64 filter, u32 crc, + const struct element *check_inherit) { - const struct element *elem, *sub; + const struct element *elem; bool calc_crc = filter != 0; DECLARE_BITMAP(seen_elems, 256); const u8 *ie; @@ -910,6 +910,11 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, u8 elen = elem->datalen; const u8 *pos = elem->data; + if (check_inherit && + !cfg80211_is_element_inherited(elem, + check_inherit)) + continue; + switch (id) { case WLAN_EID_SSID: case WLAN_EID_SUPP_RATES: @@ -1208,57 +1213,6 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; - case WLAN_EID_MULTIPLE_BSSID: - if (!bss_bssid || !transmitter_bssid || elen < 4) - break; - - elems->max_bssid_indicator = pos[0]; - - for_each_element(sub, pos + 1, elen - 1) { - u8 sub_len = sub->datalen; - u8 new_bssid[ETH_ALEN]; - const u8 *index; - - /* - * we only expect the "non-transmitted BSSID - * profile" subelement (subelement id 0) - */ - if (sub->id != 0 || sub->datalen < 4) { - /* not a valid BSS profile */ - continue; - } - - if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || - sub->data[1] != 2) { - /* The first element of the - * Nontransmitted BSSID Profile is not - * the Nontransmitted BSSID Capability - * element. 
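/* Editor's worked example for the MBSSID parsing around this point: both
 * the removed inline loop and the new ieee802_11_find_bssid_profile() match
 * a candidate by regenerating the nontransmitted BSSID from the transmitter
 * BSSID, the Max BSSID Indicator n and the BSSID index i -- the low n bits
 * of the 48-bit address become (low_n_bits + i) mod 2^n. A standalone
 * sketch of that derivation (modeling cfg80211_gen_new_bssid's behaviour):
 */
#include <stdint.h>

static void model_gen_new_bssid(const uint8_t tx[6], unsigned int n,
				uint8_t idx, uint8_t out[6])
{
	uint64_t addr = 0;
	uint64_t mask = (1ULL << n) - 1;
	int i;

	for (i = 0; i < 6; i++)		/* MAC bytes are big-endian */
		addr = (addr << 8) | tx[i];

	/* keep the high bits, add the index into the low n bits mod 2^n */
	addr = (addr & ~mask) | ((addr + idx) & mask);

	for (i = 5; i >= 0; i--) {
		out[i] = (uint8_t)(addr & 0xff);
		addr >>= 8;
	}
}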
- */ - continue; - } - - /* found a Nontransmitted BSSID Profile */ - index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, - sub->data, sub_len); - if (!index || index[1] < 1 || index[2] == 0) { - /* Invalid MBSSID Index element */ - continue; - } - - cfg80211_gen_new_bssid(transmitter_bssid, - pos[0], - index[2], - new_bssid); - if (ether_addr_equal(new_bssid, bss_bssid)) { - elems->nontransmitted_bssid_profile = - (void *)sub; - elems->bssid_index_len = index[1]; - elems->bssid_index = (void *)&index[2]; - break; - } - } - break; case WLAN_EID_EXTENSION: if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA && elen >= (sizeof(*elems->mu_edca_param_set) + 1)) { @@ -1300,26 +1254,108 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, return crc; } +static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, + struct ieee802_11_elems *elems, + u8 *transmitter_bssid, + u8 *bss_bssid, + u8 *nontransmitted_profile) +{ + const struct element *elem, *sub; + size_t profile_len = 0; + bool found = false; + + if (!bss_bssid || !transmitter_bssid) + return profile_len; + + for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { + if (elem->datalen < 2) + continue; + + for_each_element(sub, elem->data + 1, elem->datalen - 1) { + u8 new_bssid[ETH_ALEN]; + const u8 *index; + + if (sub->id != 0 || sub->datalen < 4) { + /* not a valid BSS profile */ + continue; + } + + if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || + sub->data[1] != 2) { + /* The first element of the + * Nontransmitted BSSID Profile is not + * the Nontransmitted BSSID Capability + * element. + */ + continue; + } + + memset(nontransmitted_profile, 0, len); + profile_len = cfg80211_merge_profile(start, len, + elem, + sub, + nontransmitted_profile, + len); + + /* found a Nontransmitted BSSID Profile */ + index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, + nontransmitted_profile, + profile_len); + if (!index || index[1] < 1 || index[2] == 0) { + /* Invalid MBSSID Index element */ + continue; + } + + cfg80211_gen_new_bssid(transmitter_bssid, + elem->data[0], + index[2], + new_bssid); + if (ether_addr_equal(new_bssid, bss_bssid)) { + found = true; + elems->bssid_index_len = index[1]; + elems->bssid_index = (void *)&index[2]; + break; + } + } + } + + return found ? 
profile_len : 0; +} + u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, struct ieee802_11_elems *elems, u64 filter, u32 crc, u8 *transmitter_bssid, u8 *bss_bssid) { + const struct element *non_inherit = NULL; + u8 *nontransmitted_profile; + int nontransmitted_profile_len = 0; + memset(elems, 0, sizeof(*elems)); elems->ie_start = start; elems->total_len = len; + nontransmitted_profile = kmalloc(len, GFP_ATOMIC); + if (nontransmitted_profile) { + nontransmitted_profile_len = + ieee802_11_find_bssid_profile(start, len, elems, + transmitter_bssid, + bss_bssid, + nontransmitted_profile); + non_inherit = + cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + nontransmitted_profile, + nontransmitted_profile_len); + } + crc = _ieee802_11_parse_elems_crc(start, len, action, elems, filter, - crc, transmitter_bssid, bss_bssid); + crc, non_inherit); /* Override with nontransmitted profile, if found */ - if (transmitter_bssid && elems->nontransmitted_bssid_profile) { - const u8 *profile = elems->nontransmitted_bssid_profile; - - _ieee802_11_parse_elems_crc(&profile[2], profile[1], - action, elems, 0, 0, - transmitter_bssid, bss_bssid); - } + if (nontransmitted_profile_len) + _ieee802_11_parse_elems_crc(nontransmitted_profile, + nontransmitted_profile_len, + action, elems, 0, 0, NULL); if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; @@ -1339,6 +1375,8 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, offsetofend(struct ieee80211_bssid_index, dtim_count)) elems->dtim_count = elems->bssid_index->dtim_count; + kfree(nontransmitted_profile); + return crc; } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 5f7c96368b11..6a3187883c4b 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -141,6 +141,42 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, return ieee80211_downgrade_queue(sdata, NULL, skb); } +u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct mac80211_qos_map *qos_map; + bool qos; + + /* all mesh/ocb stations are required to support WME */ + if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT || + sdata->vif.type == NL80211_IFTYPE_OCB) + qos = true; + else if (sta) + qos = sta->sta.wme; + else + qos = false; + + if (!qos) { + skb->priority = 0; /* required for correct WPA/11i MIC */ + return IEEE80211_AC_BE; + } + + if (skb->protocol == sdata->control_port_protocol) { + skb->priority = 7; + goto downgrade; + } + + /* use the data classifier to determine what 802.1d tag the + * data frame has */ + qos_map = rcu_dereference(sdata->qos_map); + skb->priority = cfg80211_classify8021d(skb, qos_map ? + &qos_map->qos_map : NULL); + + downgrade: + return ieee80211_downgrade_queue(sdata, sta, skb); +} + + /* Indicate which queue to use. 
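/* Editor's sketch: __ieee80211_select_queue() above ends by classifying the
 * frame to an 802.1d priority and letting ieee80211_downgrade_queue() map
 * that to an access category. The standard 802.1d-to-AC mapping it relies
 * on looks like this (model only; the kernel's copy is ieee802_1d_to_ac[],
 * with VO=0, VI=1, BE=2, BK=3 as in mac80211's AC enum):
 */
enum model_ac { MODEL_AC_VO, MODEL_AC_VI, MODEL_AC_BE, MODEL_AC_BK };

static const enum model_ac model_1d_to_ac[8] = {
	MODEL_AC_BE,	/* priority 0: best effort */
	MODEL_AC_BK,	/* priority 1: background */
	MODEL_AC_BK,	/* priority 2 */
	MODEL_AC_BE,	/* priority 3 */
	MODEL_AC_VI,	/* priority 4: video */
	MODEL_AC_VI,	/* priority 5 */
	MODEL_AC_VO,	/* priority 6: voice */
	MODEL_AC_VO,	/* priority 7 (control port frames land here) */
};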
*/ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) @@ -148,10 +184,12 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct sta_info *sta = NULL; const u8 *ra = NULL; - bool qos = false; - struct mac80211_qos_map *qos_map; u16 ret; + /* when using iTXQ, we can do this later */ + if (local->ops->wake_tx_queue) + return 0; + if (local->hw.queues < IEEE80211_NUM_ACS || skb->len < 6) { skb->priority = 0; /* required for correct WPA/11i MIC */ return 0; @@ -161,10 +199,8 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: sta = rcu_dereference(sdata->u.vlan.sta); - if (sta) { - qos = sta->sta.wme; + if (sta) break; - } /* fall through */ case NL80211_IFTYPE_AP: ra = skb->data; @@ -172,56 +208,26 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_WDS: ra = sdata->u.wds.remote_addr; break; -#ifdef CONFIG_MAC80211_MESH - case NL80211_IFTYPE_MESH_POINT: - qos = true; - break; -#endif case NL80211_IFTYPE_STATION: /* might be a TDLS station */ sta = sta_info_get(sdata, skb->data); if (sta) - qos = sta->sta.wme; + break; ra = sdata->u.mgd.bssid; break; case NL80211_IFTYPE_ADHOC: ra = skb->data; break; - case NL80211_IFTYPE_OCB: - /* all stations are required to support WME */ - qos = true; - break; default: break; } - if (!sta && ra && !is_multicast_ether_addr(ra)) { + if (!sta && ra && !is_multicast_ether_addr(ra)) sta = sta_info_get(sdata, ra); - if (sta) - qos = sta->sta.wme; - } - if (!qos) { - skb->priority = 0; /* required for correct WPA/11i MIC */ - ret = IEEE80211_AC_BE; - goto out; - } + ret = __ieee80211_select_queue(sdata, sta, skb); - if (skb->protocol == sdata->control_port_protocol) { - skb->priority = 7; - goto downgrade; - } - - /* use the data classifier to determine what 802.1d tag the - * data frame has */ - qos_map = rcu_dereference(sdata->qos_map); - skb->priority = cfg80211_classify8021d(skb, qos_map ? 
- &qos_map->qos_map : NULL); - - downgrade: - ret = ieee80211_downgrade_queue(sdata, sta, skb); - out: rcu_read_unlock(); return ret; } diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h index 80151edc5195..b1b1439cb91b 100644 --- a/net/mac80211/wme.h +++ b/net/mac80211/wme.h @@ -16,6 +16,8 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr); +u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb); u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index f7c544592ec8..baa098291fb0 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -22,8 +22,8 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #endif -#include <net/addrconf.h> -#include <net/nexthop.h> +#include <net/ipv6_stubs.h> +#include <net/rtnh.h> #include "internal.h" /* max memory we will use for mpls_route */ @@ -1223,11 +1223,13 @@ static int mpls_netconf_valid_get_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_mpls_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_mpls_policy, extack); - err = nlmsg_parse_strict(nlh, sizeof(struct netconfmsg), tb, - NETCONFA_MAX, devconf_mpls_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), + tb, NETCONFA_MAX, + devconf_mpls_policy, extack); if (err) return err; @@ -1788,8 +1790,8 @@ static int rtm_to_route_config(struct sk_buff *skb, int index; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); if (err < 0) goto errout; @@ -2017,7 +2019,7 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, u8 linkdown = 0; u8 dead = 0; - mp = nla_nest_start(skb, RTA_MULTIPATH); + mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; @@ -2106,8 +2108,8 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, cb->answer_flags = NLM_F_DUMP_FILTERED; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_mpls_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); if (err < 0) return err; @@ -2290,8 +2292,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_mpls_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || @@ -2306,8 +2308,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_mpls_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_mpls_policy, extack); if (err) return err; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index f3a8557494d6..951b52d5835b 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -137,10 +137,14 @@ static int mpls_xmit(struct sk_buff *skb) mpls_stats_inc_outucastpkts(out_dev, skb); - if (rt) - err = neigh_xmit(NEIGH_ARP_TABLE, 
out_dev, &rt->rt_gateway, - skb); - else if (rt6) { + if (rt) { + if (rt->rt_gw_family == AF_INET) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4, + skb); + else if (rt->rt_gw_family == AF_INET6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6, + skb); + } else if (rt6) { if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) { /* 6PE (RFC 4798) */ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3], @@ -174,8 +178,8 @@ static int mpls_build_state(struct nlattr *nla, u8 n_labels; int ret; - ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, - mpls_iptunnel_policy, extack); + ret = nla_parse_nested_deprecated(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy, extack); if (ret < 0) return ret; diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index bad17bba8ba7..7fc4feddafa3 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -79,7 +79,7 @@ static int ncsi_write_channel_info(struct sk_buff *skb, nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.alpha2); nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); - vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); + vid_nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); if (!vid_nest) return -ENOMEM; ncf = &nc->vlan_filter; @@ -113,19 +113,19 @@ static int ncsi_write_package_info(struct sk_buff *skb, NCSI_FOR_EACH_PACKAGE(ndp, np) { if (np->id != id) continue; - pnest = nla_nest_start(skb, NCSI_PKG_ATTR); + pnest = nla_nest_start_noflag(skb, NCSI_PKG_ATTR); if (!pnest) return -ENOMEM; nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); if ((0x1 << np->id) == ndp->package_whitelist) nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); - cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); + cnest = nla_nest_start_noflag(skb, NCSI_PKG_ATTR_CHANNEL_LIST); if (!cnest) { nla_nest_cancel(skb, pnest); return -ENOMEM; } NCSI_FOR_EACH_CHANNEL(np, nc) { - nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR); + nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR); if (!nest) { nla_nest_cancel(skb, cnest); nla_nest_cancel(skb, pnest); @@ -187,7 +187,7 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); - attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + attr = nla_nest_start_noflag(skb, NCSI_ATTR_PACKAGE_LIST); if (!attr) { kfree_skb(skb); return -EMSGSIZE; @@ -220,8 +220,8 @@ static int ncsi_pkg_info_all_nl(struct sk_buff *skb, void *hdr; int rc; - rc = genlmsg_parse(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, - ncsi_genl_policy, NULL); + rc = genlmsg_parse_deprecated(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, + ncsi_genl_policy, NULL); if (rc) return rc; @@ -250,7 +250,7 @@ static int ncsi_pkg_info_all_nl(struct sk_buff *skb, goto err; } - attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + attr = nla_nest_start_noflag(skb, NCSI_ATTR_PACKAGE_LIST); if (!attr) { rc = -EMSGSIZE; goto err; @@ -723,38 +723,38 @@ static int ncsi_set_channel_mask_nl(struct sk_buff *msg, static const struct genl_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, - .policy = ncsi_genl_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_pkg_info_nl, .dumpit = ncsi_pkg_info_all_nl, .flags = 0, }, { .cmd = NCSI_CMD_SET_INTERFACE, - .policy = ncsi_genl_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_interface_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_CLEAR_INTERFACE, - .policy = ncsi_genl_policy, + .validate 
= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_clear_interface_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SEND_CMD, - .policy = ncsi_genl_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_send_cmd_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SET_PACKAGE_MASK, - .policy = ncsi_genl_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_package_mask_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SET_CHANNEL_MASK, - .policy = ncsi_genl_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_channel_mask_nl, .flags = GENL_ADMIN_PERM, }, @@ -764,6 +764,7 @@ static struct genl_family ncsi_genl_family __ro_after_init = { .name = "NCSI", .version = 0, .maxattr = NCSI_ATTR_MAX, + .policy = ncsi_genl_policy, .module = THIS_MODULE, .ops = ncsi_ops, .n_ops = ARRAY_SIZE(ncsi_ops), diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index dc07fcc7938e..802db01e3075 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/netdevice.h> +#include <linux/etherdevice.h> #include <linux/skbuff.h> #include <net/ncsi.h> @@ -667,7 +668,10 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN); /* Increase mac address by 1 for BMC's address */ - saddr.sa_data[ETH_ALEN - 1]++; + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 6548271209a0..02b281d3c167 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -404,11 +404,6 @@ config NF_NAT forms of full Network Address Port Translation. This can be controlled by iptables, ip6tables or nft. -config NF_NAT_NEEDED - bool - depends on NF_NAT - default y - config NF_NAT_AMANDA tristate depends on NF_CONNTRACK && NF_NAT @@ -1002,6 +997,20 @@ config NETFILTER_XT_TARGET_REDIRECT To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_MASQUERADE + tristate "MASQUERADE target support" + depends on NF_NAT + default m if NETFILTER_ADVANCED=n + select NF_NAT_MASQUERADE + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. 
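/* Editor's sketch for the NCSI BCM GMA change above: eth_addr_inc() treats
 * the six MAC bytes as one 48-bit big-endian integer and increments it with
 * carry, which is exactly why the result must be re-validated -- a wrap can
 * produce a multicast or all-zero address. Standalone model:
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static void model_eth_addr_inc(uint8_t addr[6])
{
	int i;

	for (i = 5; i >= 0; i--)
		if (++addr[i] != 0)	/* stop once a byte doesn't wrap */
			break;
}

static bool model_eth_addr_valid(const uint8_t addr[6])
{
	static const uint8_t zero[6];

	/* multicast bit set, or all zeroes: not a usable unicast MAC */
	return !(addr[0] & 0x01) && memcmp(addr, zero, 6) != 0;
}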
+ config NETFILTER_XT_TARGET_TEE tristate '"TEE" - packet cloning to alternate destination' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4894a85cdd0b..72cca6b48960 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -77,7 +77,8 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ + nft_chain_route.o nf_tables_set-objs := nf_tables_set_core.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o @@ -147,6 +148,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_MASQUERADE) += xt_MASQUERADE.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 93aaec3a54ec..b96fd3f54705 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> +#include <net/netfilter/nf_queue.h> #include <net/sock.h> #include "nf_internals.h" @@ -162,7 +163,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, static void hooks_validate(const struct nf_hook_entries *hooks) { -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_MISC struct nf_hook_ops **orig_ops; int prio = INT_MIN; size_t i = 0; diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 257ca393e6f2..38ef2ea838cb 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -99,7 +99,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) struct nlattr *nested; size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || @@ -109,7 +109,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; - ipset_nest_end(skb, nested); + nla_nest_end(skb, nested); return 0; nla_put_failure: @@ -213,7 +213,7 @@ mtype_list(const struct ip_set *set, u32 id, first = cb->args[IPSET_CB_ARG0]; int ret = 0; - adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + adt = nla_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; /* Extensions may be replaced */ @@ -230,7 +230,7 @@ mtype_list(const struct ip_set *set, #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); @@ -244,9 +244,9 @@ mtype_list(const struct ip_set *set, goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x))) goto nla_put_failure; - ipset_nest_end(skb, nested); + nla_nest_end(skb, nested); } - ipset_nest_end(skb, adt); + nla_nest_end(skb, adt); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; @@ -259,7 +259,7 @@ nla_put_failure: 
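/* Editor's note on the ipset conversions around this point: after the
 * netlink attribute rework, nla_nest_start() sets NLA_F_NESTED on the
 * container attribute itself and nla_nest_start_noflag() preserves the old
 * bare behaviour, so ipset's private ipset_nest_start() wrapper -- which
 * OR-ed the flag in by hand -- becomes redundant. The distinction, as a
 * model:
 */
#include <stdint.h>

#define MODEL_NLA_F_NESTED (1u << 15)

static uint16_t model_nest_type_noflag(uint16_t attrtype)
{
	return attrtype;			/* legacy container type */
}

static uint16_t model_nest_type(uint16_t attrtype)
{
	return attrtype | MODEL_NLA_F_NESTED;	/* new default */
}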
cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } - ipset_nest_end(skb, adt); + nla_nest_end(skb, adt); out: rcu_read_unlock(); return ret; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 45a257695bef..3f4a4936f63c 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -299,8 +299,7 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, - ipaddr_policy, NULL)) + if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; @@ -318,8 +317,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, - ipaddr_policy, NULL)) + if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; @@ -939,8 +937,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Without holding any locks, create private part. */ if (attr[IPSET_ATTR_DATA] && - nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], - set->type->create_policy, NULL)) { + nla_parse_nested_deprecated(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } @@ -1298,8 +1295,9 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) ip_set_id_t index; /* Second pass, so parser can't fail */ - nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, - ip_set_setname_policy, NULL); + nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, ip_set_setname_policy, + NULL); cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { @@ -1546,8 +1544,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, ip_set_adt_policy, NULL); + nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, + ip_set_adt_policy, NULL); errline = nla_data(cda[IPSET_ATTR_LINENO]); @@ -1592,9 +1591,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, - attr[IPSET_ATTR_DATA], - set->type->adt_policy, NULL)) + if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, use_lineno); @@ -1605,8 +1602,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, memset(tb, 0, sizeof(tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || - nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, - set->type->adt_policy, NULL)) + nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, use_lineno); @@ -1647,9 +1643,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, 
use_lineno = !!attr[IPSET_ATTR_LINENO]; if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, - attr[IPSET_ATTR_DATA], - set->type->adt_policy, NULL)) + if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, use_lineno); @@ -1660,8 +1654,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, memset(tb, 0, sizeof(*tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || - nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, - set->type->adt_policy, NULL)) + nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, use_lineno); @@ -1692,8 +1685,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!set) return -ENOENT; - if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], - set->type->adt_policy, NULL)) + if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 2c9609929c71..01d51f775f12 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1057,7 +1057,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htable_bits = t->htable_bits; rcu_read_unlock_bh(); - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE, @@ -1079,7 +1079,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; - ipset_nest_end(skb, nested); + nla_nest_end(skb, nested); return 0; nla_put_failure: @@ -1124,7 +1124,7 @@ mtype_list(const struct ip_set *set, void *incomplete; int i, ret = 0; - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; @@ -1150,7 +1150,7 @@ mtype_list(const struct ip_set *set, continue; pr_debug("list hash %lu hbucket %p i %u, data %p\n", cb->args[IPSET_CB_ARG0], n, i, e); - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (cb->args[IPSET_CB_ARG0] == first) { nla_nest_cancel(skb, atd); @@ -1163,10 +1163,10 @@ mtype_list(const struct ip_set *set, goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; - ipset_nest_end(skb, nested); + nla_nest_end(skb, nested); } } - ipset_nest_end(skb, atd); + nla_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; @@ -1180,7 +1180,7 @@ nla_put_failure: cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } else { - ipset_nest_end(skb, atd); + nla_nest_end(skb, atd); } out: rcu_read_unlock(); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 8da228da53ae..4f894165cdcd 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -466,7 +466,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) struct nlattr *nested; size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size; - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto 
nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || @@ -476,7 +476,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; - ipset_nest_end(skb, nested); + nla_nest_end(skb, nested); return 0; nla_put_failure: @@ -494,7 +494,7 @@ list_set_list(const struct ip_set *set, struct set_elem *e; int ret = 0; - atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; @@ -506,7 +506,7 @@ list_set_list(const struct ip_set *set, i++; continue; } - nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; ip_set_name_byindex(map->net, e->id, name); @@ -514,11 +514,11 @@ list_set_list(const struct ip_set *set, goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; - ipset_nest_end(skb, nested); + nla_nest_end(skb, nested); i++; } - ipset_nest_end(skb, atd); + nla_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; @@ -531,7 +531,7 @@ nla_put_failure: ret = -EMSGSIZE; } else { cb->args[IPSET_CB_ARG0] = i; - ipset_nest_end(skb, atd); + nla_nest_end(skb, atd); } out: rcu_read_unlock(); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 43bbaa32b1d6..14457551bcb4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1678,7 +1678,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (!cp) { int v; - if (!sysctl_schedule_icmp(ipvs)) + if (ipip || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 053cd96b9c76..0e887159425c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -831,6 +831,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* set the tunnel info */ + dest->tun_type = udest->tun_type; + dest->tun_port = udest->tun_port; + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; @@ -987,6 +991,13 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ @@ -1051,6 +1062,13 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } + if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if (udest->tun_port == 0) { + pr_err("%s(): tunnel port is zero\n", __func__); + return -EINVAL; + } + } + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ @@ -2333,6 +2351,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; udest->af = AF_INET; + udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; } static int @@ -2890,12 +2909,14 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 
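The ip_vs_ctl.c hunks above add the first pieces of GUE tunnel support to IPVS: __ip_vs_update_dest() copies tun_type/tun_port into the destination, and both ip_vs_add_dest() and ip_vs_edit_dest() refuse a GUE destination whose UDP port is zero. A sketch of that validation, with assumed constants (IPIP=0/GUE=1 mirror the IP_VS_CONN_F_TUNNEL_TYPE_* values):

#include <stdint.h>
#include <stdio.h>

enum { TUN_TYPE_IPIP = 0, TUN_TYPE_GUE = 1 };

struct dest_conf {
    int tun_type;
    uint16_t tun_port;   /* network byte order in the kernel */
};

/* GUE rides on UDP, so a zero destination port can never be valid. */
static int validate_dest(const struct dest_conf *d)
{
    if (d->tun_type == TUN_TYPE_GUE && d->tun_port == 0)
        return -22;      /* -EINVAL, as in ip_vs_add_dest()/ip_vs_edit_dest() */
    return 0;
}

int main(void)
{
    struct dest_conf bad = { TUN_TYPE_GUE, 0 };
    struct dest_conf ok  = { TUN_TYPE_GUE, 5580 };

    printf("bad: %d, ok: %d\n", validate_dest(&bad), validate_dest(&ok));
    return 0;
}

Note also that ip_vs_copy_udest_compat() defaults legacy (sockopt) users to IPIP, so old ipvsadm binaries keep their previous behaviour.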
1] = { [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, + [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { - struct nlattr *nl_stats = nla_nest_start(skb, container_type); + struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; @@ -2925,7 +2946,7 @@ nla_put_failure: static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { - struct nlattr *nl_stats = nla_nest_start(skb, container_type); + struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; @@ -2971,7 +2992,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_kstats kstats; char *sched_name; - nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) return -EMSGSIZE; @@ -3095,8 +3116,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, /* Parse mandatory identifying service fields first */ if (nla == NULL || - nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, - ip_vs_svc_policy, NULL)) + nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) return -EINVAL; nla_af = attrs[IPVS_SVC_ATTR_AF]; @@ -3182,7 +3202,7 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) struct nlattr *nl_dest; struct ip_vs_kstats kstats; - nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) return -EMSGSIZE; @@ -3193,6 +3213,10 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || + nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, + dest->tun_type) || + nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, + dest->tun_port) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3254,8 +3278,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, mutex_lock(&__ip_vs_mutex); /* Try to find the service for which to dump destinations */ - if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, - ip_vs_cmd_policy, cb->extack)) + if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) goto out_err; @@ -3291,8 +3314,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* Parse mandatory identifying destination fields first */ if (nla == NULL || - nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, - ip_vs_dest_policy, NULL)) + nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) return -EINVAL; nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; @@ -3315,12 +3337,14 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh; + *nla_l_thresh, *nla_tun_type, *nla_tun_port; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = 
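IPVS_DEST_ATTR_TUN_TYPE (u8) and IPVS_DEST_ATTR_TUN_PORT (u16) extend the destination policy just above, and ip_vs_genl_fill_dest() exports them with nla_put_u8()/nla_put_be16(), i.e. the port travels in network byte order. A sketch of how such a pair sits in an attribute stream, using hypothetical attribute ids and a hand-rolled encoder:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative ids; the real values come from the IPVS uapi header. */
enum { DEST_ATTR_TUN_TYPE = 13, DEST_ATTR_TUN_PORT = 14 };

struct nlattr { uint16_t nla_len; uint16_t nla_type; };

static size_t put_attr(uint8_t *buf, size_t off, uint16_t type,
                       const void *data, uint16_t len)
{
    struct nlattr a = { (uint16_t)(sizeof(a) + len), type };

    memcpy(buf + off, &a, sizeof(a));
    memcpy(buf + off + sizeof(a), data, len);
    return off + ((a.nla_len + 3u) & ~3u);   /* NLA_ALIGN */
}

int main(void)
{
    uint8_t buf[32], tun_type = 1;           /* GUE */
    uint16_t tun_port = htons(5580);         /* be16 on the wire */
    size_t off = 0;

    off = put_attr(buf, off, DEST_ATTR_TUN_TYPE, &tun_type, 1);
    off = put_attr(buf, off, DEST_ATTR_TUN_PORT, &tun_port, 2);
    printf("encoded %zu bytes\n", off);
    return 0;
}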
attrs[IPVS_DEST_ATTR_L_THRESH]; + nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; + nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3330,6 +3354,12 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, udest->weight = nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); + + if (nla_tun_type) + udest->tun_type = nla_get_u8(nla_tun_type); + + if (nla_tun_port) + udest->tun_port = nla_get_be16(nla_tun_port); } return 0; @@ -3340,7 +3370,7 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, { struct nlattr *nl_daemon; - nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); if (!nl_daemon) return -EMSGSIZE; @@ -3528,9 +3558,7 @@ static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || - nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, - info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy, info->extack)) + nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) goto out; if (cmd == IPVS_CMD_NEW_DAEMON) @@ -3774,94 +3802,98 @@ out: static const struct genl_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_SERVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, .dumpit = ip_vs_genl_dump_services, - .policy = ip_vs_cmd_policy, }, { .cmd = IPVS_CMD_NEW_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_DEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .dumpit = ip_vs_genl_dump_dests, }, { .cmd = IPVS_CMD_NEW_DAEMON, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_daemons, }, { .cmd = IPVS_CMD_SET_CONFIG, + .validate = 
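Two related changes run through the genl_ops table that follows: every op drops its per-op .policy pointer (the policy moves to the genl family at the end of this diff), and every op gains .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP so that the core's new strict validation does not break existing ipvsadm binaries. A toy model of that dispatch decision (all structures here are simplified stand-ins, not the kernel's):

#include <stdio.h>

#define DONT_VALIDATE_STRICT 0x1
#define DONT_VALIDATE_DUMP   0x2

struct op { int cmd; unsigned validate; };
struct family {
    const char *name;
    const int *policy;   /* one policy for the whole family */
    struct op *ops;
    int n_ops;
};

static int dispatch(const struct family *f, int cmd, int is_dump)
{
    for (int i = 0; i < f->n_ops; i++) {
        if (f->ops[i].cmd != cmd)
            continue;
        /* Core-level validation happens here against f->policy, unless
         * the op opted out -- one policy pointer instead of one per op. */
        if (is_dump && !(f->ops[i].validate & DONT_VALIDATE_DUMP))
            printf("strictly validating dump for cmd %d\n", cmd);
        else if (!is_dump && !(f->ops[i].validate & DONT_VALIDATE_STRICT))
            printf("strictly validating doit for cmd %d\n", cmd);
        else
            printf("cmd %d: legacy (lenient) validation\n", cmd);
        return 0;
    }
    return -1;
}

int main(void)
{
    static const int policy[] = { 0 };
    struct op ops[] = { { 1, DONT_VALIDATE_STRICT | DONT_VALIDATE_DUMP } };
    struct family fam = { "IPVS", policy, ops, 1 };

    return dispatch(&fam, 1, 0);
}

Centralizing the policy also lets the generic netlink core introspect a family's commands without calling into it.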
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_GET_INFO, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_ZERO, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = ip_vs_cmd_policy, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, @@ -3872,6 +3904,7 @@ static struct genl_family ip_vs_genl_family __ro_after_init = { .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_ATTR_MAX, + .policy = ip_vs_cmd_policy, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, .ops = ip_vs_genl_ops, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 175349fcf91f..8d6f94b67772 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -32,6 +32,7 @@ #include <linux/slab.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> +#include <net/gue.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ @@ -382,6 +383,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst); } else { mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -533,6 +538,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst); else { mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (!dest) + goto err_put; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -989,6 +998,41 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af) } } +static int +ipvs_gue_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 dport; + __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); + struct udphdr *udph; /* Our new UDP header */ + struct guehdr *gueh; /* Our new GUE header */ + + skb_push(skb, sizeof(struct guehdr)); + + gueh = (struct guehdr *)skb->data; + + gueh->control = 0; + gueh->version = 0; + gueh->hlen = 0; + gueh->flags = 0; + gueh->proto_ctype = *next_protocol; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + udph = udp_hdr(skb); + + dport = cp->dest->tun_port; + udph->dest = dport; + udph->source = sport; + udph->len = htons(skb->len); + udph->check = 0; + + *next_protocol = IPPROTO_UDP; + + return 0; +} + /* * IP Tunneling transmitter * @@ -1025,6 +1069,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; + int tun_type, gso_type; EnterFunction(10); @@ -1046,6 +1091,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ 
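ipvs_gue_encap() above is the heart of the new encapsulation mode: it pushes a 4-byte GUE base header whose proto_ctype records the inner protocol, then a UDP header addressed to the configured tun_port, and finally rewrites next_protocol to IPPROTO_UDP so the outer IP header is built correctly. The route-lookup paths shave the same bytes off the tunnel MTU. The arithmetic, with a simplified header layout (the kernel's struct guehdr packs hlen/control/version into bitfields; only the sizes matter here):

#include <stdint.h>
#include <stdio.h>

/* Simplified 4-byte GUE base header. */
struct gue_hdr {
    uint8_t  hlen_ctl_ver;   /* hlen:5, control:1, version:2 */
    uint8_t  proto_ctype;    /* inner protocol, e.g. IPPROTO_IPIP */
    uint16_t flags;
};

struct udp_hdr { uint16_t source, dest, len, check; };

int main(void)
{
    const unsigned link_mtu = 1500;
    const unsigned ipv4_overhead = 20;       /* outer iphdr */
    unsigned mtu = link_mtu - ipv4_overhead;

    /* What __ip_vs_get_out_rt() now subtracts for GUE destinations. */
    mtu -= sizeof(struct udp_hdr) + sizeof(struct gue_hdr);

    printf("payload MTU over IPv4+GUE: %u\n", mtu);  /* 1500-20-8-4 = 1468 */
    return 0;
}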
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + tun_type = cp->dest->tun_type; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, @@ -1054,11 +1104,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af))) + gso_type = __tun_gso_type_mask(AF_INET, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + gso_type |= SKB_GSO_UDP_TUNNEL; + + if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; skb->transport_header = skb->network_header; + skb_set_inner_ipproto(skb, next_protocol); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + ipvs_gue_encap(net, skb, cp, &next_protocol); + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1102,6 +1161,8 @@ int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) { + struct netns_ipvs *ipvs = cp->ipvs; + struct net *net = ipvs->net; struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ @@ -1112,10 +1173,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int ret, local; + int tun_type, gso_type; EnterFunction(10); - local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest, + local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest, &cp->daddr.in6, &saddr, ipvsh, 1, IP_VS_RT_MODE_LOCAL | @@ -1134,17 +1196,31 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + tun_type = cp->dest->tun_type; + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, &dsfield, &ttl, NULL); if (IS_ERR(skb)) goto tx_error; - if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af))) + gso_type = __tun_gso_type_mask(AF_INET6, cp->af); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + gso_type |= SKB_GSO_UDP_TUNNEL; + + if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; skb->transport_header = skb->network_header; + skb_set_inner_ipproto(skb, next_protocol); + + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + ipvs_gue_encap(net, skb, cp, &next_protocol); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1167,7 +1243,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ret = ip_vs_tunnel_xmit_prepare(skb, cp); if (ret == NF_ACCEPT) - ip6_local_out(cp->ipvs->net, skb->sk, skb); + ip6_local_out(net, skb->sk, skb); else if (ret == NF_DROP) kfree_skb(skb); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index f2681ec5b5f6..dbec6fca0d9e 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -28,11 +28,13 @@ static unsigned int master_timeout __read_mostly = 300; static char *ts_algo = "kmp"; +#define HELPER_NAME 
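Note the ordering in both xmit paths above: the offload flags (now including SKB_GSO_UDP_TUNNEL for GUE) and skb_set_inner_ipproto() are applied first, then the GUE/UDP headers are pushed, then the outer IP header, and max_headroom is enlarged up front so the pushes never need to reallocate. A toy model of that headroom accounting, using a plain byte buffer in place of an skb:

#include <stdio.h>
#include <string.h>

#define HEADROOM 64   /* what max_headroom reserves before any push */

int main(void)
{
    char pkt[HEADROOM + 32];
    size_t data = HEADROOM;                     /* inner packet starts here */

    memcpy(pkt + data, "INNER", 5);

    /* Headers are pushed innermost-first, mirroring the skb_push() calls. */
    data -= 4;  memcpy(pkt + data, "GUE.", 4);                   /* GUE  */
    data -= 8;  memcpy(pkt + data, "UDPHDR..", 8);               /* UDP  */
    data -= 20; memcpy(pkt + data, "OUTER-IPV4-HEADER...", 20);  /* IPv4 */

    printf("headers occupy %zu bytes of headroom\n", HEADROOM - data);
    return 0;
}

If the reserved headroom were too small, the kernel equivalent would have to expand the skb, which is exactly what the max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr) hunks avoid.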
"amanda" + MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_amanda"); -MODULE_ALIAS_NFCT_HELPER("amanda"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); @@ -179,13 +181,14 @@ static const struct nf_conntrack_expect_policy amanda_exp_policy = { static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { { - .name = "amanda", + .name = HELPER_NAME, .me = THIS_MODULE, .help = amanda_help, .tuple.src.l3num = AF_INET, .tuple.src.u.udp.port = cpu_to_be16(10080), .tuple.dst.protonum = IPPROTO_UDP, .expect_policy = &amanda_exp_policy, + .nat_mod_name = NF_NAT_HELPER_NAME(HELPER_NAME), }, { .name = "amanda", @@ -195,6 +198,7 @@ static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { .tuple.src.u.udp.port = cpu_to_be16(10080), .tuple.dst.protonum = IPPROTO_UDP, .expect_policy = &amanda_exp_policy, + .nat_mod_name = NF_NAT_HELPER_NAME(HELPER_NAME), }, }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 82bfbeef46af..2a714527cde1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/random.h> #include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/moduleparam.h> @@ -449,6 +450,40 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); +/* Generate a almost-unique pseudo-id for a given conntrack. + * + * intentionally doesn't re-use any of the seeds used for hash + * table location, we assume id gets exposed to userspace. + * + * Following nf_conn items do not change throughout lifetime + * of the nf_conn after it has been committed to main hash table: + * + * 1. nf_conn address + * 2. nf_conn->ext address + * 3. nf_conn->master address (normally NULL) + * 4. tuple + * 5. the associated net namespace + */ +u32 nf_ct_get_id(const struct nf_conn *ct) +{ + static __read_mostly siphash_key_t ct_id_seed; + unsigned long a, b, c, d; + + net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); + + a = (unsigned long)ct; + b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct)); + c = (unsigned long)ct->ext; + d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash), + &ct_id_seed); +#ifdef CONFIG_64BIT + return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); +#else + return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); +#endif +} +EXPORT_SYMBOL_GPL(nf_ct_get_id); + static void clean_from_lists(struct nf_conn *ct) { @@ -982,12 +1017,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); - if (tstamp) { - if (skb->tstamp == 0) - __net_timestamp(skb); + if (tstamp) + tstamp->start = ktime_get_real_ns(); - tstamp->start = ktime_to_ns(skb->tstamp); - } /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. 
The RCU barriers * guarantee that no other CPU can find the conntrack before the above @@ -1350,6 +1382,7 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; + ct->timeout = 0; write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 334d6e5b7762..59c18804a10a 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -336,7 +336,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, exp->tuple.dst.u.all = *dst; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index a11c304fb771..32aeac1c4760 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -29,11 +29,13 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <linux/netfilter/nf_conntrack_ftp.h> +#define HELPER_NAME "ftp" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp connection tracking helper"); MODULE_ALIAS("ip_conntrack_ftp"); -MODULE_ALIAS_NFCT_HELPER("ftp"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); /* This is slow, but it's simple. --RR */ static char *ftp_buffer; @@ -588,12 +590,14 @@ static int __init nf_conntrack_ftp_init(void) /* FIXME should be configurable whether IPv4 and IPv6 FTP connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp", - FTP_PORT, ports[i], ports[i], &ftp_exp_policy, - 0, help, nf_ct_ftp_from_nlattr, THIS_MODULE); - nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp", - FTP_PORT, ports[i], ports[i], &ftp_exp_policy, - 0, help, nf_ct_ftp_from_nlattr, THIS_MODULE); + nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, + HELPER_NAME, FTP_PORT, ports[i], ports[i], + &ftp_exp_policy, 0, help, + nf_ct_ftp_from_nlattr, THIS_MODULE); + nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, + HELPER_NAME, FTP_PORT, ports[i], ports[i], + &ftp_exp_policy, 0, help, + nf_ct_ftp_from_nlattr, THIS_MODULE); } ret = nf_conntrack_helpers_register(ftp, ports_c * 2); diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 1601275efe2d..4c2ef42e189c 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -172,7 +172,7 @@ static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits) if (bits % BITS_PER_BYTE > 0) bytes++; - if (*bs->cur + bytes > *bs->end) + if (bs->cur + bytes > bs->end) return 1; return 0; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 005589c6d0f6..12de40390e97 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -748,24 +748,19 @@ static int callforward_do_filter(struct net *net, } break; } -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { - const struct nf_ipv6_ops *v6ops; struct rt6_info *rt1, *rt2; struct flowi6 fl1, fl2; - v6ops = nf_get_ipv6_ops(); - if (!v6ops) - return 0; - memset(&fl1, 0, sizeof(fl1)); fl1.daddr = src->in6; memset(&fl2, 0, sizeof(fl2)); 
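Buried in the nf_conntrack_h323_asn1.c hunk above is a genuine bug fix: bs->cur and bs->end are positions in the decode buffer, so the overflow check must compare the pointers themselves. The old code dereferenced both and compared the bytes stored at those positions, which is meaningless as a bounds check. Reduced to its essentials:

#include <stdio.h>

struct bitstr { const unsigned char *cur, *end; };

/* Fixed form: compare positions, not the bytes stored there. */
static int out_of_bounds(const struct bitstr *bs, size_t bytes)
{
    return bs->cur + bytes > bs->end;
}

int main(void)
{
    unsigned char buf[8] = { 0xff, 0, 0, 0, 0, 0, 0, 0x01 };
    struct bitstr bs = { buf, buf + sizeof(buf) };

    /* Old check: *bs.cur + 4 > *bs.end, i.e. 0xff + 4 > 0x01 -- a bogus
     * "overflow" for this perfectly valid 4-byte read; with other data it
     * could just as well miss a real overrun. */
    printf("4-byte read in bounds: %s\n", out_of_bounds(&bs, 4) ? "no" : "yes");
    return 0;
}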
fl2.daddr = dst->in6; - if (!v6ops->route(net, (struct dst_entry **)&rt1, + if (!nf_ip6_route(net, (struct dst_entry **)&rt1, flowi6_to_flowi(&fl1), false)) { - if (!v6ops->route(net, (struct dst_entry **)&rt2, + if (!nf_ip6_route(net, (struct dst_entry **)&rt2, flowi6_to_flowi(&fl2), false)) { if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr), rt6_nexthop(rt2, &fl2.daddr)) && diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 274baf1dab87..918df7f71c8f 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -42,6 +42,9 @@ module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, "Enable automatic conntrack helper assignment (default 0)"); +static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); +static struct list_head nf_ct_nat_helpers __read_mostly; + /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -130,6 +133,70 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) } EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); +static struct nf_conntrack_nat_helper * +nf_conntrack_nat_helper_find(const char *mod_name) +{ + struct nf_conntrack_nat_helper *cur; + bool found = false; + + list_for_each_entry_rcu(cur, &nf_ct_nat_helpers, list) { + if (!strcmp(cur->mod_name, mod_name)) { + found = true; + break; + } + } + return found ? cur : NULL; +} + +int +nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + struct nf_conntrack_nat_helper *nat; + char mod_name[NF_CT_HELPER_NAME_LEN]; + int ret = 0; + + rcu_read_lock(); + h = __nf_conntrack_helper_find(name, l3num, protonum); + if (!h) { + rcu_read_unlock(); + return -ENOENT; + } + + nat = nf_conntrack_nat_helper_find(h->nat_mod_name); + if (!nat) { + snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name); + rcu_read_unlock(); + request_module(mod_name); + + rcu_read_lock(); + nat = nf_conntrack_nat_helper_find(mod_name); + if (!nat) { + rcu_read_unlock(); + return -ENOENT; + } + } + + if (!try_module_get(nat->module)) + ret = -ENOENT; + + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_helper_try_module_get); + +void nf_nat_helper_put(struct nf_conntrack_helper *helper) +{ + struct nf_conntrack_nat_helper *nat; + + nat = nf_conntrack_nat_helper_find(helper->nat_mod_name); + if (WARN_ON_ONCE(!nat)) + return; + + module_put(nat->module); +} +EXPORT_SYMBOL_GPL(nf_nat_helper_put); + struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { @@ -430,6 +497,8 @@ void nf_ct_helper_init(struct nf_conntrack_helper *helper, helper->help = help; helper->from_nlattr = from_nlattr; helper->me = module; + snprintf(helper->nat_mod_name, sizeof(helper->nat_mod_name), + NF_NAT_HELPER_PREFIX "%s", name); if (spec_port == default_port) snprintf(helper->name, sizeof(helper->name), "%s", name); @@ -466,6 +535,22 @@ void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); +void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat) +{ + mutex_lock(&nf_ct_nat_helpers_mutex); + list_add_rcu(&nat->list, &nf_ct_nat_helpers); + mutex_unlock(&nf_ct_nat_helpers_mutex); +} +EXPORT_SYMBOL_GPL(nf_nat_helper_register); + +void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) +{ + mutex_lock(&nf_ct_nat_helpers_mutex); + 
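The new registry in nf_conntrack_helper.c lets conntrack pull in the matching NAT helper on demand: each helper records its NAT counterpart's module name (nat_mod_name), and nf_nat_helper_try_module_get() falls back to request_module() when that name is not registered yet, then retries the lookup. The lookup/register pair, reduced to a userspace sketch (a plain linked list instead of the kernel's RCU-protected list, names illustrative):

#include <stdio.h>
#include <string.h>

struct nat_helper { const char *mod_name; struct nat_helper *next; };

static struct nat_helper *nat_helpers;

/* Mirror of nf_conntrack_nat_helper_find(): linear scan by module name. */
static struct nat_helper *nat_helper_find(const char *mod_name)
{
    for (struct nat_helper *cur = nat_helpers; cur; cur = cur->next)
        if (!strcmp(cur->mod_name, mod_name))
            return cur;
    return NULL;
}

/* What nf_nat_helper_register() does, minus locking. */
static void nat_helper_register(struct nat_helper *nat)
{
    nat->next = nat_helpers;
    nat_helpers = nat;
}

int main(void)
{
    struct nat_helper ftp_nat = { "nf_nat_ftp", NULL };

    printf("before: %p\n", (void *)nat_helper_find("nf_nat_ftp"));
    nat_helper_register(&ftp_nat);   /* done at NAT-helper module init */
    printf("after:  %p\n", (void *)nat_helper_find("nf_nat_ftp"));
    return 0;
}

In the kernel, a failed first lookup triggers request_module() and one retry; only then does the caller give up with -ENOENT.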
list_del_rcu(&nat->list); + mutex_unlock(&nf_ct_nat_helpers_mutex); +} +EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); + static const struct nf_ct_ext_type helper_extend = { .len = sizeof(struct nf_conn_help), .align = __alignof__(struct nf_conn_help), @@ -493,6 +578,7 @@ int nf_conntrack_helper_init(void) goto out_extend; } + INIT_LIST_HEAD(&nf_ct_nat_helpers); return 0; out_extend: kvfree(nf_ct_helper_hash); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 4099f4d79bae..79e5014b3b0d 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -42,11 +42,13 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, struct nf_conntrack_expect *exp) __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_irc_hook); +#define HELPER_NAME "irc" + MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_irc"); -MODULE_ALIAS_NFCT_HELPER("irc"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); @@ -259,7 +261,7 @@ static int __init nf_conntrack_irc_init(void) ports[ports_c++] = IRC_PORT; for (i = 0; i < ports_c; i++) { - nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc", + nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, HELPER_NAME, IRC_PORT, ports[i], i, &irc_exp_policy, 0, help, NULL, THIS_MODULE); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 66c596d287a5..7db79c1b8084 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -29,6 +29,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> +#include <linux/siphash.h> #include <linux/netfilter.h> #include <net/netlink.h> @@ -45,7 +46,7 @@ #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #endif @@ -62,7 +63,7 @@ static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, int ret = 0; struct nlattr *nest_parms; - nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) @@ -103,7 +104,7 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, int ret = 0; struct nlattr *nest_parms; - nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_IP); if (!nest_parms) goto nla_put_failure; @@ -186,7 +187,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) if (!l4proto->to_nlattr) return 0; - nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED); + nest_proto = nla_nest_start(skb, CTA_PROTOINFO); if (!nest_proto) goto nla_put_failure; @@ -214,7 +215,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb, if (!helper) goto out; - nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED); + nest_helper = nla_nest_start(skb, CTA_HELP); if (!nest_helper) goto nla_put_failure; if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) @@ -248,7 +249,7 @@ dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, bytes = atomic64_read(&counter[dir].bytes); } - nest_count = nla_nest_start(skb, attr | NLA_F_NESTED); + 
nest_count = nla_nest_start(skb, attr); if (!nest_count) goto nla_put_failure; @@ -292,7 +293,7 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) if (!tstamp) return 0; - nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED); + nest_count = nla_nest_start(skb, CTA_TIMESTAMP); if (!nest_count) goto nla_put_failure; @@ -336,7 +337,7 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) return 0; ret = -1; - nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED); + nest_secctx = nla_nest_start(skb, CTA_SECCTX); if (!nest_secctx) goto nla_put_failure; @@ -396,7 +397,7 @@ static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) if (!(ct->status & IPS_EXPECTED)) return 0; - nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) @@ -414,7 +415,7 @@ dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) { struct nlattr *nest_parms; - nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, type); if (!nest_parms) goto nla_put_failure; @@ -466,7 +467,7 @@ static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) if (!synproxy) return 0; - nest_parms = nla_nest_start(skb, CTA_SYNPROXY | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_SYNPROXY); if (!nest_parms) goto nla_put_failure; @@ -485,7 +486,9 @@ nla_put_failure: static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) + __be32 id = (__force __be32)nf_ct_get_id(ct); + + if (nla_put_be32(skb, CTA_ID, id)) goto nla_put_failure; return 0; @@ -525,7 +528,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, zone = nf_ct_zone(ct); - nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) @@ -535,7 +538,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, goto nla_put_failure; nla_nest_end(skb, nest_parms); - nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) @@ -655,7 +658,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif @@ -717,7 +720,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) zone = nf_ct_zone(ct); - nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) @@ -727,7 +730,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; nla_nest_end(skb, nest_parms); - nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if 
(!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) @@ -1017,12 +1020,12 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; - ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL); + ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL); if (ret < 0) return ret; - ret = nla_validate_nested(attr, CTA_IP_MAX, - cta_ip_nla_policy, NULL); + ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX, + cta_ip_nla_policy, NULL); if (ret) return ret; @@ -1049,8 +1052,8 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nlattr *tb[CTA_PROTO_MAX+1]; int ret = 0; - ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy, - NULL); + ret = nla_parse_nested_deprecated(tb, CTA_PROTO_MAX, attr, + proto_nla_policy, NULL); if (ret < 0) return ret; @@ -1062,8 +1065,9 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, l4proto = nf_ct_l4proto_find(tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { - ret = nla_validate_nested(attr, CTA_PROTO_MAX, - l4proto->nla_policy, NULL); + ret = nla_validate_nested_deprecated(attr, CTA_PROTO_MAX, + l4proto->nla_policy, + NULL); if (ret == 0) ret = l4proto->nlattr_to_tuple(tb, tuple); } @@ -1126,8 +1130,8 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], memset(tuple, 0, sizeof(*tuple)); - err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy, - NULL); + err = nla_parse_nested_deprecated(tb, CTA_TUPLE_MAX, cda[type], + tuple_nla_policy, NULL); if (err < 0) return err; @@ -1177,7 +1181,8 @@ static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, int err; struct nlattr *tb[CTA_HELP_MAX+1]; - err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_HELP_MAX, attr, + help_nla_policy, NULL); if (err < 0) return err; @@ -1251,7 +1256,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, struct nf_conntrack_tuple tuple; struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = nfmsg->version ? 
nfmsg->nfgen_family : AF_UNSPEC; struct nf_conntrack_zone zone; int err; @@ -1286,8 +1291,9 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } if (cda[CTA_ID]) { - u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); - if (id != (u32)(unsigned long)ct) { + __be32 id = nla_get_be32(cda[CTA_ID]); + + if (id != (__force __be32)nf_ct_get_id(ct)) { nf_ct_put(ct); return -ENOENT; } @@ -1494,7 +1500,7 @@ static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, return -EOPNOTSUPP; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, @@ -1586,7 +1592,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) static int ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) int ret; if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) @@ -1717,8 +1723,8 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, struct nlattr *tb[CTA_PROTOINFO_MAX+1]; int err = 0; - err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy, - NULL); + err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_MAX, attr, + protoinfo_policy, NULL); if (err < 0) return err; @@ -1741,7 +1747,8 @@ static int change_seq_adj(struct nf_ct_seqadj *seq, int err; struct nlattr *cda[CTA_SEQADJ_MAX+1]; - err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy, NULL); + err = nla_parse_nested_deprecated(cda, CTA_SEQADJ_MAX, attr, + seqadj_policy, NULL); if (err < 0) return err; @@ -1818,8 +1825,9 @@ static int ctnetlink_change_synproxy(struct nf_conn *ct, if (!synproxy) return 0; - err = nla_parse_nested(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY], - synproxy_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_SYNPROXY_MAX, + cda[CTA_SYNPROXY], synproxy_policy, + NULL); if (err < 0) return err; @@ -2369,7 +2377,7 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif @@ -2396,7 +2404,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) zone = nf_ct_zone(ct); - nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) @@ -2406,7 +2414,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; nla_nest_end(skb, nest_parms); - nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) @@ -2468,7 +2476,7 @@ ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct, { struct nlattr *nest_parms; - nest_parms = nla_nest_start(skb, ct_attr | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, ct_attr); if (!nest_parms) goto nla_put_failure; @@ -2549,7 +2557,8 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) struct nlattr *cda[CTA_MAX+1]; int ret; - ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy, NULL); + ret = nla_parse_nested_deprecated(cda, CTA_MAX, 
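With IDs no longer derived from the object address, any lookup that takes a CTA_ID (the delete path above is one) simply recomputes nf_ct_get_id() on the candidate entry and compares, so nothing extra is stored per entry and a stale or forged handle just misses; expectations get the same treatment via nf_expect_get_id() a little further down. The pattern, with a stand-in hash:

#include <stdint.h>
#include <stdio.h>

struct ct { uint64_t cookie; };

/* Stand-in for nf_ct_get_id(): any keyed PRF of invariant fields works. */
static uint32_t get_id(const struct ct *c)
{
    uint64_t h = c->cookie * 0x9e3779b97f4a7c15ull;
    return (uint32_t)(h >> 32);
}

/* Recompute-and-compare, as ctnetlink_del_conntrack() now does. */
static int del_by_id(const struct ct *c, uint32_t user_id)
{
    if (user_id != get_id(c))
        return -2;   /* -ENOENT: stale or forged handle */
    return 0;        /* proceed with deletion */
}

int main(void)
{
    struct ct c = { 42 };

    printf("good: %d, stale: %d\n",
           del_by_id(&c, get_id(&c)), del_by_id(&c, 7));
    return 0;
}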
attr, ct_nla_policy, + NULL); if (ret < 0) return ret; @@ -2582,8 +2591,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, struct nf_conntrack_expect *exp; int err; - err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy, - NULL); + err = nla_parse_nested_deprecated(cda, CTA_EXPECT_MAX, attr, + exp_nla_policy, NULL); if (err < 0) return err; @@ -2640,7 +2649,7 @@ static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, { struct nlattr *nest_parms; - nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, type); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, tuple) < 0) @@ -2667,7 +2676,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, m.src.u.all = mask->src.u.all; m.dst.protonum = tuple->dst.protonum; - nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK); if (!nest_parms) goto nla_put_failure; @@ -2692,6 +2701,25 @@ nla_put_failure: static const union nf_inet_addr any_addr; +static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) +{ + static __read_mostly siphash_key_t exp_id_seed; + unsigned long a, b, c, d; + + net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); + + a = (unsigned long)exp; + b = (unsigned long)exp->helper; + c = (unsigned long)exp->master; + d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed); + +#ifdef CONFIG_64BIT + return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed); +#else + return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed); +#endif +} + static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) @@ -2699,7 +2727,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, struct nf_conn *master = exp->master; long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; struct nf_conn_help *help; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *nest_parms; struct nf_conntrack_tuple nat_tuple = {}; #endif @@ -2717,10 +2745,10 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, CTA_EXPECT_MASTER) < 0) goto nla_put_failure; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || exp->saved_proto.all) { - nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT); if (!nest_parms) goto nla_put_failure; @@ -2739,7 +2767,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, } #endif if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || - nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || + nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) || nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) goto nla_put_failure; @@ -3044,7 +3072,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } @@ -3180,13 +3209,13 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, struct nf_conntrack_expect *exp, u_int8_t u3) { -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; struct nf_conntrack_tuple nat_tuple = {}; int err; - err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, - exp_nat_nla_policy, 
NULL); + err = nla_parse_nested_deprecated(tb, CTA_EXPECT_NAT_MAX, attr, + exp_nat_nla_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b9403a266a2e..37bb530d848f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -55,7 +55,7 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, struct va_format vaf; va_list args; - if (net->ct.sysctl_log_invalid != protonum || + if (net->ct.sysctl_log_invalid != protonum && net->ct.sysctl_log_invalid != IPPROTO_RAW) return; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 6fca80587505..7491aa4c3566 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -598,7 +598,7 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nlattr *nest_parms; spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) || @@ -639,8 +639,8 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) if (!attr) return 0; - err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr, - dccp_nla_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, + dccp_nla_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 7df477996b16..9becac953587 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -103,49 +103,94 @@ int nf_conntrack_icmp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, - const struct nf_hook_state *state) +/* Check inner header is related to any of the existing connections */ +int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state, + u8 l4proto, union nf_inet_addr *outer_daddr) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + union nf_inet_addr *ct_daddr; + enum ip_conntrack_dir dir; + struct nf_conn *ct; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, state->net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); + if (!nf_ct_get_tuplepr(skb, dataoff, + state->pf, state->net, &origtuple)) return -NF_ACCEPT; - } /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
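The one-character nf_conntrack_proto.c change above ('||' to '&&') fixes inverted logic in nf_l4proto_log_invalid(): with '||', the early return fired even when sysctl_log_invalid named exactly this protocol, so invalid-packet logging effectively never happened. The intended rule is "log if the sysctl matches this protocol, or is IPPROTO_RAW (255), the log-everything value":

#include <stdio.h>

#define IPPROTO_RAW_VAL 255

static int should_log(int sysctl_log_invalid, int protonum)
{
    /* Fixed condition: bail out only if it matches neither. */
    return !(sysctl_log_invalid != protonum &&
             sysctl_log_invalid != IPPROTO_RAW_VAL);
}

int main(void)
{
    printf("sysctl=6   proto=6 -> %d\n", should_log(6, 6));    /* 1: match  */
    printf("sysctl=255 proto=6 -> %d\n", should_log(255, 6));  /* 1: log all */
    printf("sysctl=17  proto=6 -> %d\n", should_log(17, 6));   /* 0: other  */
    return 0;
}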
*/ - if (!nf_ct_invert_tuple(&innertuple, &origtuple)) { - pr_debug("icmp_error_message: no match\n"); + if (!nf_ct_invert_tuple(&innertuple, &origtuple)) return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; h = nf_conntrack_find_get(state->net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); + if (!h) + return -NF_ACCEPT; + + /* Consider: A -> T (=This machine) -> B + * Conntrack entry will look like this: + * Original: A->B + * Reply: B->T (SNAT case) OR A + * + * When this function runs, we got packet that looks like this: + * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..). + * + * Above nf_conntrack_find_get() makes lookup based on inner_hdr, + * so we should expect that destination of the found connection + * matches outer header destination address. + * + * In above example, we can consider these two cases: + * 1. Error coming in reply direction from B or M (middle box) to + * T (SNAT case) or A. + * Inner saddr will be B, dst will be T or A. + * The found conntrack will be reply tuple (B->T/A). + * 2. Error coming in original direction from A or M to B. + * Inner saddr will be A, inner daddr will be B. + * The found conntrack will be original tuple (A->B). + * + * In both cases, conntrack[dir].dst == inner.dst. + * + * A bogus packet could look like this: + * Inner: B->T + * Outer: B->X (other machine reachable by T). + * + * In this case, lookup yields connection A->B and will + * set packet from B->X as *RELATED*, even though no connection + * from X was ever seen. + */ + ct = nf_ct_tuplehash_to_ctrack(h); + dir = NF_CT_DIRECTION(h); + ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; + if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { + if (state->pf == AF_INET) { + nf_l4proto_log_invalid(skb, state->net, state->pf, + l4proto, + "outer daddr %pI4 != inner %pI4", + &outer_daddr->ip, &ct_daddr->ip); + } else if (state->pf == AF_INET6) { + nf_l4proto_log_invalid(skb, state->net, state->pf, + l4proto, + "outer daddr %pI6 != inner %pI6", + &outer_daddr->ip6, &ct_daddr->ip6); + } + nf_ct_put(ct); return -NF_ACCEPT; } - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo = IP_CT_RELATED; + if (dir == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + nf_ct_set(skb, ct, ctinfo); return NF_ACCEPT; } @@ -162,11 +207,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { + union nf_inet_addr outer_daddr; const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? 
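The rewritten handler, nf_conntrack_inet_error(), adds a plausibility check the old icmp_error_message() lacked: after the embedded inner header finds a connection, the outer destination address must equal that connection's destination in the matched direction. Otherwise a host could fabricate ICMP errors quoting its own flows and get packets toward unrelated machines flagged RELATED (the "Inner: B->T, Outer: B->X" case spelled out in the comment above). The check itself, IPv4-only for brevity:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

struct tuple { struct in_addr dst; };   /* dst of the matched direction */

/* conntrack[dir].dst must equal the outer header's destination. */
static int icmp_error_plausible(const struct tuple *found_dir_tuple,
                                struct in_addr outer_daddr)
{
    return memcmp(&found_dir_tuple->dst, &outer_daddr,
                  sizeof(outer_daddr)) == 0;
}

int main(void)
{
    struct tuple t;
    struct in_addr outer_ok, outer_spoof;

    inet_pton(AF_INET, "192.0.2.1", &t.dst);
    outer_ok = t.dst;
    inet_pton(AF_INET, "198.51.100.7", &outer_spoof);

    printf("legit: %d, spoofed: %d\n",
           icmp_error_plausible(&t, outer_ok),
           icmp_error_plausible(&t, outer_spoof));
    return 0;
}

Passing dataoff and the address family in from the callers is what lets one helper serve both the ICMP and (below) the ICMPv6 paths.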
*/ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); + icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; @@ -199,7 +245,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(tmpl, skb, state); + memset(&outer_daddr, 0, sizeof(outer_daddr)); + outer_daddr.ip = ip_hdr(skb)->daddr; + + dataoff += sizeof(*icmph); + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMP, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index bec4a3211658..c63ee3612855 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -123,51 +123,6 @@ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, return NF_ACCEPT; } -static int -icmpv6_error_message(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int icmp6off) -{ - struct nf_conntrack_tuple intuple, origtuple; - const struct nf_conntrack_tuple_hash *h; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) - + sizeof(struct ipv6hdr) - + sizeof(struct icmp6hdr), - PF_INET6, net, &origtuple)) { - pr_debug("icmpv6_error: Can't get tuple\n"); - return -NF_ACCEPT; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple)) { - pr_debug("icmpv6_error: Can't invert tuple\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), - &intuple); - if (!h) { - pr_debug("icmpv6_error: no match\n"); - return -NF_ACCEPT; - } else { - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, @@ -182,6 +137,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, unsigned int dataoff, const struct nf_hook_state *state) { + union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; @@ -210,7 +166,11 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(state->net, tmpl, skb, dataoff); + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += sizeof(*icmp6h); + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index a7818101ad80..5b8dde266412 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -520,7 +520,7 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nlattr *nest_parms; spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; @@ -563,8 +563,8 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) if (!attr) return 0; 
- err = nla_parse_nested(tb, CTA_PROTOINFO_SCTP_MAX, attr, - sctp_nla_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, + sctp_nla_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index a06875a466a4..7ba01d8ee165 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1192,7 +1192,7 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_ct_tcp_flags tmp = {}; spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP); if (!nest_parms) goto nla_put_failure; @@ -1248,8 +1248,8 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) if (!pattr) return 0; - err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, - tcp_nla_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr, + tcp_nla_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 5072ff96ab33..83306648dd0f 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -30,10 +30,12 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <linux/netfilter/nf_conntrack_sane.h> +#define HELPER_NAME "sane" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>"); MODULE_DESCRIPTION("SANE connection tracking helper"); -MODULE_ALIAS_NFCT_HELPER("sane"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); static char *sane_buffer; @@ -195,12 +197,12 @@ static int __init nf_conntrack_sane_init(void) /* FIXME should be configurable whether IPv4 and IPv6 connections are tracked or not - YK */ for (i = 0; i < ports_c; i++) { - nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane", - SANE_PORT, ports[i], ports[i], + nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, + HELPER_NAME, SANE_PORT, ports[i], ports[i], &sane_exp_policy, 0, help, NULL, THIS_MODULE); - nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane", - SANE_PORT, ports[i], ports[i], + nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, + HELPER_NAME, SANE_PORT, ports[i], ports[i], &sane_exp_policy, 0, help, NULL, THIS_MODULE); } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 39fcc1ed18f3..c30c883c370b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -30,11 +30,13 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_sip.h> +#define HELPER_NAME "sip" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); MODULE_DESCRIPTION("SIP connection tracking helper"); MODULE_ALIAS("ip_conntrack_sip"); -MODULE_ALIAS_NFCT_HELPER("sip"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); #define MAX_PORTS 8 static unsigned short ports[MAX_PORTS]; @@ -928,7 +930,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff, nfct_help(exp->master)->helper != nfct_help(ct)->helper || exp->class != class) break; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) if (!direct_rtp && (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) || exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) && @@ -1669,21 +1671,21 @@ static int __init nf_conntrack_sip_init(void) ports[ports_c++] = SIP_PORT; for (i = 0; i < ports_c; i++) { - nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, 
"sip", - SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, sip_help_udp, + nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, + HELPER_NAME, SIP_PORT, ports[i], i, + sip_exp_policy, SIP_EXPECT_MAX, sip_help_udp, NULL, THIS_MODULE); - nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip", - SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, sip_help_tcp, + nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, + HELPER_NAME, SIP_PORT, ports[i], i, + sip_exp_policy, SIP_EXPECT_MAX, sip_help_tcp, NULL, THIS_MODULE); - nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip", - SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, sip_help_udp, + nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, + HELPER_NAME, SIP_PORT, ports[i], i, + sip_exp_policy, SIP_EXPECT_MAX, sip_help_udp, NULL, THIS_MODULE); - nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip", - SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, sip_help_tcp, + nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, + HELPER_NAME, SIP_PORT, ports[i], i, + sip_exp_policy, SIP_EXPECT_MAX, sip_help_tcp, NULL, THIS_MODULE); } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index c2ae14c720b4..e0d392cb3075 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -511,6 +511,8 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) /* Log invalid packets of a given protocol */ static int log_invalid_proto_min __read_mostly; static int log_invalid_proto_max __read_mostly = 255; +static int zero; +static int one = 1; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; @@ -624,9 +626,11 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", @@ -647,33 +651,41 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", .data = &init_net.ct.sysctl_auto_assign_helper, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = { @@ -744,15 +756,19 @@ static 
struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", @@ -887,7 +903,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .procname = "nf_conntrack_dccp_loose", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_NF_CT_PROTO_GRE diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 548b673b3625..6977cb91ae9a 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -20,11 +20,13 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <linux/netfilter/nf_conntrack_tftp.h> +#define HELPER_NAME "tftp" + MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); MODULE_DESCRIPTION("TFTP connection tracking helper"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ip_conntrack_tftp"); -MODULE_ALIAS_NFCT_HELPER("tftp"); +MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); #define MAX_PORTS 8 static unsigned short ports[MAX_PORTS]; @@ -119,12 +121,14 @@ static int __init nf_conntrack_tftp_init(void) ports[ports_c++] = TFTP_PORT; for (i = 0; i < ports_c; i++) { - nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp", - TFTP_PORT, ports[i], i, &tftp_exp_policy, - 0, tftp_help, NULL, THIS_MODULE); - nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp", - TFTP_PORT, ports[i], i, &tftp_exp_policy, - 0, tftp_help, NULL, THIS_MODULE); + nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, + HELPER_NAME, TFTP_PORT, ports[i], i, + &tftp_exp_policy, 0, tftp_help, NULL, + THIS_MODULE); + nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, + HELPER_NAME, TFTP_PORT, ports[i], i, + &tftp_exp_policy, 0, tftp_help, NULL, + THIS_MODULE); } ret = nf_conntrack_helpers_register(tftp, ports_c * 2); diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 91fbd183da2d..edac8ea4436d 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -48,6 +48,95 @@ void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout) } EXPORT_SYMBOL_GPL(nf_ct_untimeout); +static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout) +{ + typeof(nf_ct_timeout_put_hook) timeout_put; + + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + if (timeout_put) + timeout_put(timeout); +} + +int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, + u8 l3num, u8 l4num, const char *timeout_name) +{ + typeof(nf_ct_timeout_find_get_hook) timeout_find_get; + struct nf_ct_timeout *timeout; + struct nf_conn_timeout *timeout_ext; + const char *errmsg = NULL; + int ret = 0; + + rcu_read_lock(); + timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); + if (!timeout_find_get) { + ret = -ENOENT; + errmsg = "Timeout policy base is empty"; + goto out; + } + + timeout = timeout_find_get(net, timeout_name); + if (!timeout) { + ret = -ENOENT; + pr_info_ratelimited("No such timeout policy 
\"%s\"\n", + timeout_name); + goto out; + } + + if (timeout->l3num != l3num) { + ret = -EINVAL; + pr_info_ratelimited("Timeout policy `%s' can only be used by " + "L%d protocol number %d\n", + timeout_name, 3, timeout->l3num); + goto err_put_timeout; + } + /* Make sure the timeout policy matches any existing protocol tracker, + * otherwise default to generic. + */ + if (timeout->l4proto->l4proto != l4num) { + ret = -EINVAL; + pr_info_ratelimited("Timeout policy `%s' can only be used by " + "L%d protocol number %d\n", + timeout_name, 4, timeout->l4proto->l4proto); + goto err_put_timeout; + } + timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); + if (!timeout_ext) { + ret = -ENOMEM; + goto err_put_timeout; + } + + rcu_read_unlock(); + return ret; + +err_put_timeout: + __nf_ct_timeout_put(timeout); +out: + rcu_read_unlock(); + if (errmsg) + pr_info_ratelimited("%s\n", errmsg); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_set_timeout); + +void nf_ct_destroy_timeout(struct nf_conn *ct) +{ + struct nf_conn_timeout *timeout_ext; + typeof(nf_ct_timeout_put_hook) timeout_put; + + rcu_read_lock(); + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + + if (timeout_put) { + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) { + timeout_put(timeout_ext->timeout); + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + } + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout); + static const struct nf_ct_ext_type timeout_extend = { .len = sizeof(struct nf_conn_timeout), .align = __alignof__(struct nf_conn_timeout), diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 7aabfd4b1e50..4469519a4879 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -185,14 +185,25 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { - flow->timeout = (u32)jiffies; + int err; - rhashtable_insert_fast(&flow_table->rhashtable, - &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - nf_flow_offload_rhash_params); - rhashtable_insert_fast(&flow_table->rhashtable, - &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - nf_flow_offload_rhash_params); + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); + if (err < 0) + return err; + + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[1].node, + nf_flow_offload_rhash_params); + if (err < 0) { + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); + return err; + } + + flow->timeout = (u32)jiffies; return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -232,6 +243,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table, { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; + struct flow_offload_entry *e; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, @@ -244,6 +256,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table, if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) return NULL; + e = container_of(flow, struct flow_offload_entry, flow); + if (unlikely(nf_ct_is_dying(e->ct))) + return NULL; + return tuplehash; } EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -290,8 +306,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; + struct flow_offload_entry 
*e; - if (nf_flow_has_expired(flow) || + e = container_of(flow, struct flow_offload_entry, flow); + if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) || (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 1d291a51cd45..0d603e20b519 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -181,6 +181,9 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, iph->protocol != IPPROTO_UDP) return -1; + if (iph->ttl <= 1) + return -1; + thoff = iph->ihl * 4; if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; @@ -235,13 +238,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + outdev = rt->dst.dev; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) @@ -411,6 +411,9 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, ip6h->nexthdr != IPPROTO_UDP) return -1; + if (ip6h->hop_limit <= 1) + return -1; + thoff = sizeof(*ip6h); if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; @@ -452,13 +455,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + outdev = rt->dst.dev; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index e15779fd58e3..d6c43902ebd7 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -7,9 +7,6 @@ #include <linux/netdevice.h> /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, - unsigned int verdict); void nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index e4d61a7a5258..4e59416ea709 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -19,10 +19,15 @@ #include <net/netfilter/nf_nat_helper.h> #include <linux/netfilter/nf_conntrack_amanda.h> +#define NAT_HELPER_NAME "amanda" + MODULE_AUTHOR("Brian J. 
Murrell <netfilter@interlinx.bc.ca>"); MODULE_DESCRIPTION("Amanda NAT helper"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_amanda"); +MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME); + +static struct nf_conntrack_nat_helper nat_helper_amanda = + NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME); static unsigned int help(struct sk_buff *skb, enum ip_conntrack_info ctinfo, @@ -74,6 +79,7 @@ static unsigned int help(struct sk_buff *skb, static void __exit nf_nat_amanda_fini(void) { + nf_nat_helper_unregister(&nat_helper_amanda); RCU_INIT_POINTER(nf_nat_amanda_hook, NULL); synchronize_rcu(); } @@ -81,6 +87,7 @@ static void __exit nf_nat_amanda_fini(void) static int __init nf_nat_amanda_init(void) { BUG_ON(nf_nat_amanda_hook != NULL); + nf_nat_helper_register(&nat_helper_amanda); RCU_INIT_POINTER(nf_nat_amanda_hook, help); return 0; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index af7dc6537758..cd94481e6c07 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -415,9 +415,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; - min = range->min_proto.icmp.id; - range_size = ntohs(range->max_proto.icmp.id) - - ntohs(range->min_proto.icmp.id) + 1; + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + min = 0; + range_size = 65536; + } else { + min = ntohs(range->min_proto.icmp.id); + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; + } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: @@ -885,8 +890,8 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr, struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; - err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, - protonat_nla_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, + protonat_nla_policy, NULL); if (err < 0) return err; @@ -944,7 +949,8 @@ nfnetlink_parse_nat(const struct nlattr *nat, memset(range, 0, sizeof(*range)); - err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); + err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, + nat_nla_policy, NULL); if (err < 0) return err; @@ -1009,7 +1015,7 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; -int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, +int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); @@ -1019,14 +1025,12 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, struct nf_hook_ops *nat_ops; int i, ret; - if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))) + if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; - nat_proto_net = &nat_net->nat_proto_net[ops->pf]; + nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { - if (WARN_ON(orig_nat_ops[i].pf != ops->pf)) - return -EINVAL; if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; @@ -1086,8 +1090,8 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, return ret; } -void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, - unsigned int ops_count) +void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, + unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct 
nf_nat_hooks_net *nat_proto_net; @@ -1096,10 +1100,10 @@ void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, int hooknum = ops->hooknum; int i; - if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)) + if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; - nat_proto_net = &nat_net->nat_proto_net[ops->pf]; + nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index 5063cbf1689c..0ea6b1bc52de 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -21,13 +21,18 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <linux/netfilter/nf_conntrack_ftp.h> +#define NAT_HELPER_NAME "ftp" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp NAT helper"); -MODULE_ALIAS("ip_nat_ftp"); +MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME); /* FIXME: Time out? --RR */ +static struct nf_conntrack_nat_helper nat_helper_ftp = + NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME); + static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type, char *buffer, size_t buflen, union nf_inet_addr *addr, u16 port) @@ -124,6 +129,7 @@ out: static void __exit nf_nat_ftp_fini(void) { + nf_nat_helper_unregister(&nat_helper_ftp); RCU_INIT_POINTER(nf_nat_ftp_hook, NULL); synchronize_rcu(); } @@ -131,6 +137,7 @@ static void __exit nf_nat_ftp_fini(void) static int __init nf_nat_ftp_init(void) { BUG_ON(nf_nat_ftp_hook != NULL); + nf_nat_helper_register(&nat_helper_ftp); RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp); return 0; } diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 3aa35a43100d..d87cbe5e03ec 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -23,10 +23,15 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <linux/netfilter/nf_conntrack_irc.h> +#define NAT_HELPER_NAME "irc" + MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("IRC (DCC) NAT helper"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_irc"); +MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME); + +static struct nf_conntrack_nat_helper nat_helper_irc = + NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME); static unsigned int help(struct sk_buff *skb, enum ip_conntrack_info ctinfo, @@ -96,6 +101,7 @@ static unsigned int help(struct sk_buff *skb, static void __exit nf_nat_irc_fini(void) { + nf_nat_helper_unregister(&nat_helper_irc); RCU_INIT_POINTER(nf_nat_irc_hook, NULL); synchronize_rcu(); } @@ -103,6 +109,7 @@ static void __exit nf_nat_irc_fini(void) static int __init nf_nat_irc_init(void) { BUG_ON(nf_nat_irc_hook != NULL); + nf_nat_helper_register(&nat_helper_irc); RCU_INIT_POINTER(nf_nat_irc_hook, help); return 0; } diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index d85c4d902e7b..8e8a65d46345 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -7,12 +7,10 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> -#include <net/netfilter/ipv6/nf_nat_masquerade.h> +#include <net/netfilter/nf_nat_masquerade.h> static DEFINE_MUTEX(masq_mutex); -static unsigned int masq_refcnt4 __read_mostly; -static unsigned int masq_refcnt6 __read_mostly; +static unsigned int masq_refcnt __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -137,56 +135,6 @@ static struct notifier_block masq_inet_notifier = { 
.notifier_call = masq_inet_event, }; -int nf_nat_masquerade_ipv4_register_notifier(void) -{ - int ret = 0; - - mutex_lock(&masq_mutex); - if (WARN_ON_ONCE(masq_refcnt4 == UINT_MAX)) { - ret = -EOVERFLOW; - goto out_unlock; - } - - /* check if the notifier was already set */ - if (++masq_refcnt4 > 1) - goto out_unlock; - - /* Register for device down reports */ - ret = register_netdevice_notifier(&masq_dev_notifier); - if (ret) - goto err_dec; - /* Register IP address change reports */ - ret = register_inetaddr_notifier(&masq_inet_notifier); - if (ret) - goto err_unregister; - - mutex_unlock(&masq_mutex); - return ret; - -err_unregister: - unregister_netdevice_notifier(&masq_dev_notifier); -err_dec: - masq_refcnt4--; -out_unlock: - mutex_unlock(&masq_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); - -void nf_nat_masquerade_ipv4_unregister_notifier(void) -{ - mutex_lock(&masq_mutex); - /* check if the notifier still has clients */ - if (--masq_refcnt4 > 0) - goto out_unlock; - - unregister_netdevice_notifier(&masq_dev_notifier); - unregister_inetaddr_notifier(&masq_inet_notifier); -out_unlock: - mutex_unlock(&masq_mutex); -} -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); - #if IS_ENABLED(CONFIG_IPV6) static atomic_t v6_worker_count __read_mostly; @@ -322,44 +270,68 @@ static struct notifier_block masq_inet6_notifier = { .notifier_call = masq_inet6_event, }; -int nf_nat_masquerade_ipv6_register_notifier(void) +static int nf_nat_masquerade_ipv6_register_notifier(void) +{ + return register_inet6addr_notifier(&masq_inet6_notifier); +} +#else +static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; } +#endif + +int nf_nat_masquerade_inet_register_notifiers(void) { int ret = 0; mutex_lock(&masq_mutex); - if (WARN_ON_ONCE(masq_refcnt6 == UINT_MAX)) { + if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) { ret = -EOVERFLOW; goto out_unlock; } - /* check if the notifier is already set */ - if (++masq_refcnt6 > 1) + /* check if the notifier was already set */ + if (++masq_refcnt > 1) goto out_unlock; - ret = register_inet6addr_notifier(&masq_inet6_notifier); + /* Register for device down reports */ + ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; + /* Register IP address change reports */ + ret = register_inetaddr_notifier(&masq_inet_notifier); + if (ret) + goto err_unregister; + + ret = nf_nat_masquerade_ipv6_register_notifier(); + if (ret) + goto err_unreg_inet; mutex_unlock(&masq_mutex); return ret; +err_unreg_inet: + unregister_inetaddr_notifier(&masq_inet_notifier); +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); err_dec: - masq_refcnt6--; + masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); +EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers); -void nf_nat_masquerade_ipv6_unregister_notifier(void) +void nf_nat_masquerade_inet_unregister_notifiers(void) { mutex_lock(&masq_mutex); - /* check if the notifier still has clients */ - if (--masq_refcnt6 > 0) + /* check if the notifiers still have clients */ + if (--masq_refcnt > 0) goto out_unlock; + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +#if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&masq_inet6_notifier); +#endif out_unlock: mutex_unlock(&masq_mutex); } -EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); -#endif 
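[Note: the masquerade rework above folds the separate IPv4/IPv6 notifier refcounts into a single masq_refcnt guarded by masq_mutex, registering on first use and unwinding partial registrations in reverse order on failure. A rough userspace model of that register-once/unregister-last pattern, assuming pthreads and dummy functions standing in for the notifier calls:]

#include <pthread.h>

static pthread_mutex_t masq_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int masq_refcnt;

/* stand-ins for register_netdevice_notifier() and friends */
static int reg_a(void) { return 0; }
static int reg_b(void) { return 0; }
static void unreg_a(void) { }
static void unreg_b(void) { }

static int notifiers_register(void)
{
	int ret = 0;

	pthread_mutex_lock(&masq_lock);
	if (++masq_refcnt > 1)		/* someone already registered everything */
		goto out;
	ret = reg_a();
	if (ret)
		goto err_dec;
	ret = reg_b();
	if (ret)
		goto err_unreg_a;
	goto out;

err_unreg_a:
	unreg_a();			/* unwind in reverse order */
err_dec:
	masq_refcnt--;
out:
	pthread_mutex_unlock(&masq_lock);
	return ret;
}

static void notifiers_unregister(void)
{
	pthread_mutex_lock(&masq_lock);
	if (--masq_refcnt == 0) {	/* last user tears everything down */
		unreg_b();
		unreg_a();
	}
	pthread_mutex_unlock(&masq_lock);
}

int main(void)
{
	if (notifiers_register() == 0)
		notifiers_unregister();
	return 0;
}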
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers); diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 62743da3004f..84f5c90a7f21 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -725,7 +725,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, return ret; } -static const struct nf_hook_ops nf_nat_ipv4_ops[] = { +const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_in, @@ -758,13 +758,14 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { - return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, + ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { - nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); @@ -925,20 +926,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, return ret; } -static int nat_route_me_harder(struct net *net, struct sk_buff *skb) -{ -#ifdef CONFIG_IPV6_MODULE - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return -EHOSTUNREACH; - - return v6_ops->route_me_harder(net, skb); -#else - return ip6_route_me_harder(net, skb); -#endif -} - static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -958,7 +945,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { - err = nat_route_me_harder(state->net, skb); + err = nf_ip6_route_me_harder(state->net, skb); if (err < 0) ret = NF_DROP_ERR(err); } @@ -977,7 +964,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, return ret; } -static const struct nf_hook_ops nf_nat_ipv6_ops[] = { +const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, @@ -1010,14 +997,44 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { - return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, + return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { - nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); + nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ + +#if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) +int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) +{ + int ret; + + if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) + return -EINVAL; + + ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, + ARRAY_SIZE(nf_nat_ipv6_ops)); + if (ret) + return ret; + + ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, + ARRAY_SIZE(nf_nat_ipv4_ops)); + if (ret) + nf_nat_ipv6_unregister_fn(net, ops); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); + +void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) +{ + nf_nat_unregister_fn(net, 
NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); + nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); +} +EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); +#endif /* NFT INET NAT */ diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index aa1be643d7a0..464387b3600f 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -24,11 +24,15 @@ #include <net/netfilter/nf_conntrack_seqadj.h> #include <linux/netfilter/nf_conntrack_sip.h> +#define NAT_HELPER_NAME "sip" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); MODULE_DESCRIPTION("SIP NAT helper"); -MODULE_ALIAS("ip_nat_sip"); +MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME); +static struct nf_conntrack_nat_helper nat_helper_sip = + NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME); static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff, unsigned int dataoff, @@ -656,8 +660,8 @@ static struct nf_ct_helper_expectfn sip_nat = { static void __exit nf_nat_sip_fini(void) { + nf_nat_helper_unregister(&nat_helper_sip); RCU_INIT_POINTER(nf_nat_sip_hooks, NULL); - nf_ct_helper_expectfn_unregister(&sip_nat); synchronize_rcu(); } @@ -675,6 +679,7 @@ static const struct nf_nat_sip_hooks sip_hooks = { static int __init nf_nat_sip_init(void) { BUG_ON(nf_nat_sip_hooks != NULL); + nf_nat_helper_register(&nat_helper_sip); RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks); nf_ct_helper_expectfn_register(&sip_nat); return 0; diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c index 7f67e1d5310d..e633b3863e33 100644 --- a/net/netfilter/nf_nat_tftp.c +++ b/net/netfilter/nf_nat_tftp.c @@ -13,10 +13,15 @@ #include <net/netfilter/nf_nat_helper.h> #include <linux/netfilter/nf_conntrack_tftp.h> +#define NAT_HELPER_NAME "tftp" + MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); MODULE_DESCRIPTION("TFTP NAT helper"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("ip_nat_tftp"); +MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME); + +static struct nf_conntrack_nat_helper nat_helper_tftp = + NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME); static unsigned int help(struct sk_buff *skb, enum ip_conntrack_info ctinfo, @@ -37,6 +42,7 @@ static unsigned int help(struct sk_buff *skb, static void __exit nf_nat_tftp_fini(void) { + nf_nat_helper_unregister(&nat_helper_tftp); RCU_INIT_POINTER(nf_nat_tftp_hook, NULL); synchronize_rcu(); } @@ -44,6 +50,7 @@ static void __exit nf_nat_tftp_fini(void) static int __init nf_nat_tftp_init(void) { BUG_ON(nf_nat_tftp_hook != NULL); + nf_nat_helper_register(&nat_helper_tftp); RCU_INIT_POINTER(nf_nat_tftp_hook, help); return 0; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index a36a77bae1d6..9dc1d6e04946 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -240,6 +240,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, return 0; } +EXPORT_SYMBOL_GPL(nf_queue); static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ef7772e976cc..28241e82fd15 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -53,7 +53,6 @@ static const struct rhashtable_params nft_chain_ht_params = { .hashfn = nft_chain_hash, .obj_hashfn = nft_chain_hash_obj, .obj_cmpfn = nft_chain_hash_cmp, - .locks_mul = 1, .automatic_shrinking = true, }; @@ -214,33 +213,33 @@ static int nft_deltable(struct nft_ctx *ctx) return err; } -static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) +static 
struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain)); if (trans == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); if (msg_type == NFT_MSG_NEWCHAIN) nft_activate_next(ctx->net, ctx->chain); list_add_tail(&trans->list, &ctx->net->nft.commit_list); - return 0; + return trans; } static int nft_delchain(struct nft_ctx *ctx) { - int err; + struct nft_trans *trans; - err = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN); - if (err < 0) - return err; + trans = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN); + if (IS_ERR(trans)) + return PTR_ERR(trans); ctx->table->use--; nft_deactivate_next(ctx->net, ctx->chain); - return err; + return 0; } static void nft_rule_expr_activate(const struct nft_ctx *ctx, @@ -1190,6 +1189,9 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) u64 pkts, bytes; int cpu; + if (!stats) + return 0; + memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); @@ -1201,7 +1203,7 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) total.pkts += pkts; total.bytes += bytes; } - nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS); + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_COUNTERS); if (nest == NULL) goto nla_put_failure; @@ -1247,9 +1249,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); const struct nf_hook_ops *ops = &basechain->ops; + struct nft_stats __percpu *stats; struct nlattr *nest; - nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); + nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); if (nest == NULL) goto nla_put_failure; if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum))) @@ -1268,8 +1271,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; - if (rcu_access_pointer(basechain->stats) && - nft_dump_stats(skb, rcu_dereference(basechain->stats))) + stats = rcu_dereference_check(basechain->stats, + lockdep_commit_lock_is_held(net)); + if (nft_dump_stats(skb, stats)) goto nla_put_failure; } @@ -1421,8 +1425,8 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) struct nft_stats *stats; int err; - err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, - NULL); + err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, + nft_counter_policy, NULL); if (err < 0) return ERR_PTR(err); @@ -1526,8 +1530,9 @@ static int nft_chain_parse_hook(struct net *net, lockdep_assert_held(&net->nft.commit_mutex); lockdep_nfnl_nft_mutex_not_held(); - err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], - nft_hook_policy, NULL); + err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, + nla[NFTA_CHAIN_HOOK], + nft_hook_policy, NULL); if (err < 0) return err; @@ -1545,7 +1550,7 @@ static int nft_chain_parse_hook(struct net *net, if (IS_ERR(type)) return PTR_ERR(type); } - if (!(type->hook_mask & (1 << hook->num))) + if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && @@ -1615,6 +1620,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; + struct nft_trans *trans; struct nft_chain *chain; 
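[Note: nft_trans_chain_add() above now hands the transaction back as a pointer, using the kernel's ERR_PTR convention so the caller can stash the chain policy in the transaction and still distinguish -ENOMEM from success. A self-contained sketch of that convention; the 4095 bound matches the kernel's MAX_ERRNO, the trans structure is illustrative:]

#include <stdlib.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errors occupy the top MAX_ERRNO values of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int msg_type; };

static struct trans *trans_alloc(int msg_type)
{
	struct trans *t = malloc(sizeof(*t));

	if (!t)
		return ERR_PTR(-ENOMEM);	/* error encoded in the pointer itself */
	t->msg_type = msg_type;
	return t;
}

int main(void)
{
	struct trans *t = trans_alloc(1);

	if (IS_ERR(t))
		return (int)-PTR_ERR(t);
	free(t);
	return 0;
}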
struct nft_rule **rules; int err; @@ -1662,7 +1668,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, ops->dev = hook.dev; chain->flags |= NFT_BASE_CHAIN; - basechain->policy = policy; + basechain->policy = NF_ACCEPT; } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1698,13 +1704,18 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err) goto err2; - err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); - if (err < 0) { + trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); rhltable_remove(&table->chains_ht, &chain->rhlhead, nft_chain_ht_params); goto err2; } + nft_trans_chain_policy(trans) = -1; + if (nft_is_base_chain(chain)) + nft_trans_chain_policy(trans) = policy; + table->use++; list_add_tail_rcu(&chain->list, &table->chains); @@ -2060,7 +2071,8 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, goto nla_put_failure; if (expr->ops->dump) { - struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA); + struct nlattr *data = nla_nest_start_noflag(skb, + NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; if (expr->ops->dump(skb, expr) < 0) @@ -2079,7 +2091,7 @@ int nft_expr_dump(struct sk_buff *skb, unsigned int attr, { struct nlattr *nest; - nest = nla_nest_start(skb, attr); + nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; if (nf_tables_fill_expr_info(skb, expr) < 0) @@ -2105,7 +2117,8 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_EXPR_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, + nft_expr_policy, NULL); if (err < 0) return err; @@ -2114,8 +2127,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, return PTR_ERR(type); if (tb[NFTA_EXPR_DATA]) { - err = nla_parse_nested(info->tb, type->maxattr, - tb[NFTA_EXPR_DATA], type->policy, NULL); + err = nla_parse_nested_deprecated(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], + type->policy, NULL); if (err < 0) goto err1; } else @@ -2290,7 +2304,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; } - list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS); + list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS); if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { @@ -3194,9 +3208,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) static __be64 nf_jiffies64_to_msecs(u64 input) { - u64 ms = jiffies64_to_nsecs(input); - - return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC)); + return cpu_to_be64(jiffies64_to_msecs(input)); } static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, @@ -3261,7 +3273,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; - desc = nla_nest_start(skb, NFTA_SET_DESC); + desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); if (desc == NULL) goto nla_put_failure; if (set->size && @@ -3439,15 +3451,14 @@ err: return err; } -static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, - struct nft_set_desc *desc, +static int nf_tables_set_desc_parse(struct nft_set_desc *desc, const struct nlattr *nla) { struct nlattr *da[NFTA_SET_DESC_MAX + 1]; int err; - err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, - nft_set_desc_policy, NULL); + err = nla_parse_nested_deprecated(da, 
NFTA_SET_DESC_MAX, nla, + nft_set_desc_policy, NULL); if (err < 0) return err; @@ -3566,7 +3577,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY])); if (nla[NFTA_SET_DESC] != NULL) { - err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]); + err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]); if (err < 0) return err; } @@ -3786,8 +3797,8 @@ bind: } EXPORT_SYMBOL_GPL(nf_tables_bind_set); -void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding, bool event) +static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_binding *binding, bool event) { list_del_rcu(&binding->list); @@ -3798,7 +3809,6 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, GFP_KERNEL); } } -EXPORT_SYMBOL_GPL(nf_tables_unbind_set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, @@ -3913,7 +3923,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; - nest = nla_nest_start(skb, NFTA_LIST_ELEM); + nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) goto nla_put_failure; @@ -4057,7 +4067,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) goto nla_put_failure; - nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS); if (nest == NULL) goto nla_put_failure; @@ -4129,7 +4139,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nla_put_string(skb, NFTA_SET_NAME, set->name)) goto nla_put_failure; - nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS); + nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS); if (nest == NULL) goto nla_put_failure; @@ -4174,8 +4184,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, void *priv; int err; - err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, - nft_set_elem_policy, NULL); + err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy, NULL); if (err < 0) return err; @@ -4406,8 +4416,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, u8 ulen; int err; - err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, - nft_set_elem_policy, NULL); + err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy, NULL); if (err < 0) return err; @@ -4700,8 +4710,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, void *priv; int err; - err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, - nft_set_elem_policy, NULL); + err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, + nft_set_elem_policy, NULL); if (err < 0) goto err1; @@ -4975,8 +4985,8 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, goto err1; if (attr) { - err = nla_parse_nested(tb, type->maxattr, attr, type->policy, - NULL); + err = nla_parse_nested_deprecated(tb, type->maxattr, attr, + type->policy, NULL); if (err < 0) goto err2; } else { @@ -5019,7 +5029,7 @@ static int nft_object_dump(struct sk_buff *skb, unsigned int attr, { struct nlattr *nest; - nest = nla_nest_start(skb, attr); + nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; if (obj->ops->dump(skb, obj, reset) < 0) @@ -5552,8 +5562,8 @@ static int 
nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, int hooknum, priority; int err, n = 0, i; - err = nla_parse_nested(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, - nft_flowtable_hook_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, + nft_flowtable_hook_policy, NULL); if (err < 0) return err; @@ -5836,14 +5846,14 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, NFTA_FLOWTABLE_PAD)) goto nla_put_failure; - nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); + nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK); if (!nest) goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) goto nla_put_failure; - nest_devs = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK_DEVS); + nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS); if (!nest_devs) goto nla_put_failure; @@ -6311,6 +6321,27 @@ static int nf_tables_validate(struct net *net) return 0; } +/* a drop policy has to be deferred until all rules have been activated, + * otherwise a large ruleset that contains a drop-policy base chain will + * cause all packets to get dropped until the full transaction has been + * processed. + * + * We defer the drop policy until the transaction has been finalized. + */ +static void nft_chain_commit_drop_policy(struct nft_trans *trans) +{ + struct nft_base_chain *basechain; + + if (nft_trans_chain_policy(trans) != NF_DROP) + return; + + if (!nft_is_base_chain(trans->ctx.chain)) + return; + + basechain = nft_base_chain(trans->ctx.chain); + basechain->policy = NF_DROP; +} + static void nft_chain_commit_update(struct nft_trans *trans) { struct nft_base_chain *basechain; @@ -6632,6 +6663,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); /* trans destroyed after rcu grace period */ } else { + nft_chain_commit_drop_policy(trans); nft_clear(net, trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN); nft_trans_destroy(trans); @@ -7210,8 +7242,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_chain *chain; int err; - err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy, - NULL); + err = nla_parse_nested_deprecated(tb, NFTA_VERDICT_MAX, nla, + nft_verdict_policy, NULL); if (err < 0) return err; @@ -7269,7 +7301,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v) { struct nlattr *nest; - nest = nla_nest_start(skb, type); + nest = nla_nest_start_noflag(skb, type); if (!nest) goto nla_put_failure; @@ -7341,7 +7373,8 @@ int nft_data_init(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla, + nft_data_policy, NULL); if (err < 0) return err; @@ -7382,7 +7415,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, struct nlattr *nest; int err; - nest = nla_nest_start(skb, attr); + nest = nla_nest_start_noflag(skb, attr); if (nest == NULL) return -1; @@ -7534,6 +7567,7 @@ static int __init nf_tables_module_init(void) if (err < 0) goto err5; + nft_chain_route_init(); return err; err5: rhltable_destroy(&nft_objname_ht); @@ -7553,6 +7587,7 @@ static void __exit nf_tables_module_exit(void) nfnetlink_subsys_unregister(&nf_tables_subsys); 
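[Note: the nft_chain_commit_drop_policy() hunk above defers an NF_DROP base-chain policy until the whole transaction has committed, so packets are not dropped while a large ruleset is only half activated. A simplified userspace model of that two-phase idea, with made-up structures standing in for the real transaction state:]

#include <stdio.h>

enum verdict { NF_ACCEPT, NF_DROP };

struct chain {
	enum verdict policy;	/* what the packet path consults */
	int nrules;
};

struct trans {
	struct chain *chain;
	int pending_policy;	/* -1 means no policy change in this transaction */
};

/* phase 1: activate rules while the chain still accepts by default */
static void commit_rules(struct trans *t, int nrules)
{
	t->chain->nrules = nrules;
}

/* phase 2: only after every rule is live, flip the default to drop */
static void commit_drop_policy(struct trans *t)
{
	if (t->pending_policy == NF_DROP)
		t->chain->policy = NF_DROP;
}

int main(void)
{
	struct chain c = { .policy = NF_ACCEPT, .nrules = 0 };
	struct trans t = { .chain = &c, .pending_policy = NF_DROP };

	commit_rules(&t, 100);
	commit_drop_policy(&t);
	printf("policy now %s with %d rules live\n",
	       c.policy == NF_DROP ? "drop" : "accept", c.nrules);
	return 0;
}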
unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); + nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_destroy_work); rcu_barrier(); diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c index 814789644bd3..a9fce8d10051 100644 --- a/net/netfilter/nf_tables_set_core.c +++ b/net/netfilter/nf_tables_set_core.c @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/module.h> #include <net/netfilter/nf_tables_core.h> static int __init nf_tables_set_module_init(void) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 916913454624..92077d459109 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -206,8 +206,9 @@ replay: return -ENOMEM; } - err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, - ss->cb[cb_id].policy, extack); + err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, + attr, attrlen, + ss->cb[cb_id].policy, extack); if (err < 0) { rcu_read_unlock(); return err; @@ -421,8 +422,10 @@ replay: goto ack; } - err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, - attrlen, ss->cb[cb_id].policy, NULL); + err = nla_parse_deprecated(cda, + ss->cb[cb_id].attr_count, + attr, attrlen, + ss->cb[cb_id].policy, NULL); if (err < 0) goto ack; @@ -520,8 +523,8 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; - err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy, - NULL); + err = nla_parse_deprecated(cda, NFNL_BATCH_MAX, attr, attrlen, + nfnl_batch_policy, NULL); if (err < 0) { netlink_ack(skb, nlh, err, NULL); return; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 8fa8bf7c48e6..02c877432d71 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -248,8 +248,8 @@ static int nfnl_acct_start(struct netlink_callback *cb) if (!attr) return 0; - err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy, - NULL); + err = nla_parse_nested_deprecated(tb, NFACCT_FILTER_MAX, attr, + filter_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index e5d27b2e4eba..17eb473a626b 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -78,8 +78,8 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, int err; struct nlattr *tb[NFCTH_TUPLE_MAX+1]; - err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, - nfnl_cthelper_tuple_pol, NULL); + err = nla_parse_nested_deprecated(tb, NFCTH_TUPLE_MAX, attr, + nfnl_cthelper_tuple_pol, NULL); if (err < 0) return err; @@ -139,8 +139,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, int err; struct nlattr *tb[NFCTH_POLICY_MAX+1]; - err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, - nfnl_cthelper_expect_pol, NULL); + err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, + nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; @@ -176,8 +176,9 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; unsigned int class_max; - ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, - nfnl_cthelper_expect_policy_set, NULL); + ret = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, + nfnl_cthelper_expect_policy_set, + NULL); if (ret < 0) return ret; @@ -289,8 +290,8 @@ 
nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, struct nlattr *tb[NFCTH_POLICY_MAX + 1]; int err; - err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, - nfnl_cthelper_expect_pol, NULL); + err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, + nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; @@ -361,8 +362,9 @@ static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, unsigned int class_max; int err; - err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, - nfnl_cthelper_expect_policy_set, NULL); + err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, + nfnl_cthelper_expect_policy_set, + NULL); if (err < 0) return err; @@ -462,7 +464,7 @@ nfnl_cthelper_dump_tuple(struct sk_buff *skb, { struct nlattr *nest_parms; - nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, NFCTH_TUPLE); if (nest_parms == NULL) goto nla_put_failure; @@ -487,7 +489,7 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb, int i; struct nlattr *nest_parms1, *nest_parms2; - nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED); + nest_parms1 = nla_nest_start(skb, NFCTH_POLICY); if (nest_parms1 == NULL) goto nla_put_failure; @@ -496,8 +498,7 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb, goto nla_put_failure; for (i = 0; i < helper->expect_class_max + 1; i++) { - nest_parms2 = nla_nest_start(skb, - (NFCTH_POLICY_SET+i) | NLA_F_NESTED); + nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET + i)); if (nest_parms2 == NULL) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index c69b11ca5aad..427b411c5739 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -59,8 +59,11 @@ ctnl_timeout_parse_policy(void *timeout, if (!tb) return -ENOMEM; - ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, attr, - l4proto->ctnl_timeout.nla_policy, NULL); + ret = nla_parse_nested_deprecated(tb, + l4proto->ctnl_timeout.nlattr_max, + attr, + l4proto->ctnl_timeout.nla_policy, + NULL); if (ret < 0) goto err; @@ -184,7 +187,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; - nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; @@ -401,7 +404,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; - nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED); + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b1f9c5303f02..0b3347570265 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -540,7 +540,7 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - if (skb->tstamp) { + if (hooknum <= NF_INET_FORWARD && skb->tstamp) { struct nfulnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(skb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 1f1d90c1716b..7b827bcb412c 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -255,9 +255,9 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, } 
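[Note: the nf_osf_find() change just below converts a genre-string return into a bool plus a result struct, so a second field (the fingerprint version) can be reported from one lookup. A minimal sketch of that out-parameter style, with a toy fingerprint table in place of the real RCU list:]

#include <stdbool.h>
#include <stdio.h>

struct osf_data {
	const char *genre;
	const char *version;
};

struct finger {
	const char *genre;
	const char *version;
	int ttl;
};

static const struct finger fingers[] = {
	{ "Linux",   "3.x", 64 },
	{ "Windows", "XP",  128 },
};

/* returns true and fills *data on a match, mirroring the new contract */
static bool osf_find(int ttl, struct osf_data *data)
{
	size_t i;

	for (i = 0; i < sizeof(fingers) / sizeof(fingers[0]); i++) {
		if (fingers[i].ttl == ttl) {
			data->genre = fingers[i].genre;
			data->version = fingers[i].version;
			return true;
		}
	}
	return false;
}

int main(void)
{
	struct osf_data data;

	if (osf_find(64, &data))
		printf("%s %s\n", data.genre, data.version);
	return 0;
}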
EXPORT_SYMBOL_GPL(nf_osf_match); -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers, - const int ttl_check) +bool nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers, + const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; @@ -265,24 +265,24 @@ const char *nf_osf_find(const struct sk_buff *skb, const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; - const char *genre = NULL; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); if (!tcp) - return NULL; + return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; - genre = f->genre; + data->genre = f->genre; + data->version = f->version; break; } - return genre; + return true; } EXPORT_SYMBOL_GPL(nf_osf_find); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 0dcc3592d053..27dac47b29c2 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -351,7 +351,7 @@ static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) if (skb_vlan_tag_present(entskb)) { struct nlattr *nest; - nest = nla_nest_start(skb, NFQA_VLAN | NLA_F_NESTED); + nest = nla_nest_start(skb, NFQA_VLAN); if (!nest) goto nla_put_failure; @@ -582,7 +582,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; - if (entskb->tstamp) { + if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); @@ -1139,8 +1139,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, struct nlattr *tb[NFQA_VLAN_MAX + 1]; int err; - err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], - nfqa_vlan_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, + nfqa[NFQA_VLAN], + nfqa_vlan_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c index ee4852088d50..2f89bde3c61c 100644 --- a/net/netfilter/nft_chain_nat.c +++ b/net/netfilter/nft_chain_nat.c @@ -74,6 +74,36 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = { }; #endif +#ifdef CONFIG_NF_TABLES_INET +static int nft_nat_inet_reg(struct net *net, const struct nf_hook_ops *ops) +{ + return nf_nat_inet_register_fn(net, ops); +} + +static void nft_nat_inet_unreg(struct net *net, const struct nf_hook_ops *ops) +{ + nf_nat_inet_unregister_fn(net, ops); +} + +static const struct nft_chain_type nft_chain_nat_inet = { + .name = "nat", + .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_INET, + .hook_mask = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_PRE_ROUTING] = nft_nat_do_chain, + [NF_INET_LOCAL_IN] = nft_nat_do_chain, + [NF_INET_LOCAL_OUT] = nft_nat_do_chain, + [NF_INET_POST_ROUTING] = nft_nat_do_chain, + }, + .ops_register = nft_nat_inet_reg, + .ops_unregister = nft_nat_inet_unreg, +}; +#endif + static int __init nft_chain_nat_init(void) { #ifdef CONFIG_NF_TABLES_IPV6 @@ -82,6 +112,9 @@ static int __init nft_chain_nat_init(void) #ifdef CONFIG_NF_TABLES_IPV4 nft_register_chain_type(&nft_chain_nat_ipv4); #endif +#ifdef CONFIG_NF_TABLES_INET + nft_register_chain_type(&nft_chain_nat_inet); +#endif return 0; } @@ -94,6 
+127,9 @@ static void __exit nft_chain_nat_exit(void) #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_chain_type(&nft_chain_nat_ipv6); #endif +#ifdef CONFIG_NF_TABLES_INET + nft_unregister_chain_type(&nft_chain_nat_inet); +#endif } module_init(nft_chain_nat_init); diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c new file mode 100644 index 000000000000..8826bbe71136 --- /dev/null +++ b/net/netfilter/nft_chain_route.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_ipv4.h> +#include <net/netfilter/nf_tables_ipv6.h> +#include <net/route.h> +#include <net/ip.h> + +#ifdef CONFIG_NF_TABLES_IPV4 +static unsigned int nf_route_table_hook4(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + const struct iphdr *iph; + struct nft_pktinfo pkt; + __be32 saddr, daddr; + unsigned int ret; + u32 mark; + int err; + u8 tos; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); + + mark = skb->mark; + iph = ip_hdr(skb); + saddr = iph->saddr; + daddr = iph->daddr; + tos = iph->tos; + + ret = nft_do_chain(&pkt, priv); + if (ret == NF_ACCEPT) { + iph = ip_hdr(skb); + + if (iph->saddr != saddr || + iph->daddr != daddr || + skb->mark != mark || + iph->tos != tos) { + err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); + if (err < 0) + ret = NF_DROP_ERR(err); + } + } + return ret; +} + +static const struct nft_chain_type nft_chain_route_ipv4 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV4, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook4, + }, +}; +#endif + +#ifdef CONFIG_NF_TABLES_IPV6 +static unsigned int nf_route_table_hook6(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct in6_addr saddr, daddr; + struct nft_pktinfo pkt; + u32 mark, flowlabel; + unsigned int ret; + u8 hop_limit; + int err; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); + + /* save source/dest address, mark, hoplimit, flowlabel, priority */ + memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); + mark = skb->mark; + hop_limit = ipv6_hdr(skb)->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either)*/ + flowlabel = *((u32 *)ipv6_hdr(skb)); + + ret = nft_do_chain(&pkt, priv); + if (ret == NF_ACCEPT && + (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || + memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u32 *)ipv6_hdr(skb)))) { + err = nf_ip6_route_me_harder(state->net, skb); + if (err < 0) + ret = NF_DROP_ERR(err); + } + + return ret; +} + +static const struct nft_chain_type nft_chain_route_ipv6 = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV6, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_hook6, + }, +}; +#endif + +#ifdef CONFIG_NF_TABLES_INET +static unsigned int nf_route_table_inet(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + switch (state->pf) { + case NFPROTO_IPV4: + return nf_route_table_hook4(priv, skb, state); + case NFPROTO_IPV6: + 
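/* The route chain type works by snapshotting the header fields that
 * feed the routing decision (addresses, tos or hop limit, mark, flow
 * label) before the chain runs, then calling ip_route_me_harder() or
 * nf_ip6_route_me_harder() only when the ruleset actually changed one
 * of them; this inet wrapper just picks the per-family hook based on
 * the packet's real family.
 */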
return nf_route_table_hook6(priv, skb, state); + default: + nft_set_pktinfo(&pkt, skb, state); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_route_inet = { + .name = "route", + .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_INET, + .hook_mask = (1 << NF_INET_LOCAL_OUT), + .hooks = { + [NF_INET_LOCAL_OUT] = nf_route_table_inet, + }, +}; +#endif + +void __init nft_chain_route_init(void) +{ +#ifdef CONFIG_NF_TABLES_IPV6 + nft_register_chain_type(&nft_chain_route_ipv6); +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + nft_register_chain_type(&nft_chain_route_ipv4); +#endif +#ifdef CONFIG_NF_TABLES_INET + nft_register_chain_type(&nft_chain_route_inet); +#endif +} + +void __exit nft_chain_route_fini(void) +{ +#ifdef CONFIG_NF_TABLES_IPV6 + nft_unregister_chain_type(&nft_chain_route_ipv6); +#endif +#ifdef CONFIG_NF_TABLES_IPV4 + nft_unregister_chain_type(&nft_chain_route_ipv4); +#endif +#ifdef CONFIG_NF_TABLES_INET + nft_unregister_chain_type(&nft_chain_route_inet); +#endif +} diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 469f9da5073b..276f1f2d6de1 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -198,8 +198,8 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) u32 flags; int err; - err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr, - nft_rule_compat_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_RULE_COMPAT_MAX, attr, + nft_rule_compat_policy, NULL); if (err < 0) return err; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 7b717fad6cdc..f043936763f3 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -178,6 +178,11 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; } #endif + case NFT_CT_ID: + if (!nf_ct_is_confirmed(ct)) + goto err; + *dest = nf_ct_get_id(ct); + return; default: break; } @@ -479,6 +484,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, len = sizeof(u16); break; #endif + case NFT_CT_ID: + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } @@ -797,9 +805,11 @@ nft_ct_timeout_parse_policy(void *timeouts, if (!tb) return -ENOMEM; - ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, - attr, l4proto->ctnl_timeout.nla_policy, - NULL); + ret = nla_parse_nested_deprecated(tb, + l4proto->ctnl_timeout.nlattr_max, + attr, + l4proto->ctnl_timeout.nla_policy, + NULL); if (ret < 0) goto err; @@ -928,7 +938,7 @@ static int nft_ct_timeout_obj_dump(struct sk_buff *skb, nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num))) return -1; - nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA | NLA_F_NESTED); + nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA); if (!nest_params) return -1; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index e461007558e8..8394560aa695 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -28,6 +28,23 @@ struct nft_dynset { struct nft_set_binding binding; }; +static int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +{ + int err; + + if (src->ops->clone) { + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; + } else { + memcpy(dst, src, src->ops->size); + } + + __module_get(src->ops->type->owner); + return 0; +} + static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 6e6b9adf7d38..69d7a8439c7a 100644 --- 
a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -94,8 +94,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (help) goto out; - if (ctinfo == IP_CT_NEW || - ctinfo == IP_CT_RELATED) + if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) @@ -113,6 +112,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (ret < 0) goto err_flow_add; + dst_release(route.tuple[!dir].dst); return; err_flow_add: diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index bee156eaa400..86fd90085eaf 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -14,8 +14,7 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_nat.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> -#include <net/netfilter/ipv6/nf_nat_masquerade.h> +#include <net/netfilter/nf_nat_masquerade.h> struct nft_masq { u32 flags; @@ -196,28 +195,73 @@ static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { static int __init nft_masq_module_init_ipv6(void) { - int ret = nft_register_expr(&nft_masq_ipv6_type); - - if (ret) - return ret; - - ret = nf_nat_masquerade_ipv6_register_notifier(); - if (ret < 0) - nft_unregister_expr(&nft_masq_ipv6_type); - - return ret; + return nft_register_expr(&nft_masq_ipv6_type); } static void nft_masq_module_exit_ipv6(void) { nft_unregister_expr(&nft_masq_ipv6_type); - nf_nat_masquerade_ipv6_unregister_notifier(); } #else static inline int nft_masq_module_init_ipv6(void) { return 0; } static inline void nft_masq_module_exit_ipv6(void) {} #endif +#ifdef CONFIG_NF_TABLES_INET +static void nft_masq_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return nft_masq_ipv4_eval(expr, regs, pkt); + case NFPROTO_IPV6: + return nft_masq_ipv6_eval(expr, regs, pkt); + } + + WARN_ON_ONCE(1); +} + +static void +nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_INET); +} + +static struct nft_expr_type nft_masq_inet_type; +static const struct nft_expr_ops nft_masq_inet_ops = { + .type = &nft_masq_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_inet_eval, + .init = nft_masq_init, + .destroy = nft_masq_inet_destroy, + .dump = nft_masq_dump, + .validate = nft_masq_validate, +}; + +static struct nft_expr_type nft_masq_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "masq", + .ops = &nft_masq_inet_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_module_init_inet(void) +{ + return nft_register_expr(&nft_masq_inet_type); +} + +static void nft_masq_module_exit_inet(void) +{ + nft_unregister_expr(&nft_masq_inet_type); +} +#else +static inline int nft_masq_module_init_inet(void) { return 0; } +static inline void nft_masq_module_exit_inet(void) {} +#endif + static int __init nft_masq_module_init(void) { int ret; @@ -226,15 +270,23 @@ static int __init nft_masq_module_init(void) if (ret < 0) return ret; + ret = nft_masq_module_init_inet(); + if (ret < 0) { + nft_masq_module_exit_ipv6(); + return ret; + } + ret = nft_register_expr(&nft_masq_ipv4_type); if (ret < 0) { + nft_masq_module_exit_inet(); nft_masq_module_exit_ipv6(); return ret; } - ret = nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_inet_register_notifiers(); if (ret < 0) { 
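/* Notifier registration failed: unregister the ipv6, inet and ipv4
 * expression types that were set up earlier in this function before
 * propagating the error.
 */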
nft_masq_module_exit_ipv6(); + nft_masq_module_exit_inet(); nft_unregister_expr(&nft_masq_ipv4_type); return ret; } @@ -245,8 +297,9 @@ static void __exit nft_masq_module_exit(void) { nft_masq_module_exit_ipv6(); + nft_masq_module_exit_inet(); nft_unregister_expr(&nft_masq_ipv4_type); - nf_nat_masquerade_ipv4_unregister_notifier(); + nf_nat_masquerade_inet_unregister_notifiers(); } module_init(nft_masq_module_init); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index e93aed9bda88..d90d421826aa 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -140,7 +140,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); - if (family != ctx->family) + if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { @@ -278,13 +278,67 @@ static struct nft_expr_type nft_nat_type __read_mostly = { .owner = THIS_MODULE, }; +#ifdef CONFIG_NF_TABLES_INET +static void nft_nat_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_nat *priv = nft_expr_priv(expr); + + if (priv->family == nft_pf(pkt)) + nft_nat_eval(expr, regs, pkt); +} + +static const struct nft_expr_ops nft_nat_inet_ops = { + .type = &nft_nat_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), + .eval = nft_nat_inet_eval, + .init = nft_nat_init, + .destroy = nft_nat_destroy, + .dump = nft_nat_dump, + .validate = nft_nat_validate, +}; + +static struct nft_expr_type nft_inet_nat_type __read_mostly = { + .name = "nat", + .family = NFPROTO_INET, + .ops = &nft_nat_inet_ops, + .policy = nft_nat_policy, + .maxattr = NFTA_NAT_MAX, + .owner = THIS_MODULE, +}; + +static int nft_nat_inet_module_init(void) +{ + return nft_register_expr(&nft_inet_nat_type); +} + +static void nft_nat_inet_module_exit(void) +{ + nft_unregister_expr(&nft_inet_nat_type); +} +#else +static int nft_nat_inet_module_init(void) { return 0; } +static void nft_nat_inet_module_exit(void) { } +#endif + static int __init nft_nat_module_init(void) { - return nft_register_expr(&nft_nat_type); + int ret = nft_nat_inet_module_init(); + + if (ret) + return ret; + + ret = nft_register_expr(&nft_nat_type); + if (ret) + nft_nat_inet_module_exit(); + + return ret; } static void __exit nft_nat_module_exit(void) { + nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index b13618c764ec..87b60d6617ef 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -7,11 +7,13 @@ struct nft_osf { enum nft_registers dreg:8; u8 ttl; + u32 flags; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, [NFTA_OSF_TTL] = { .type = NLA_U8 }, + [NFTA_OSF_FLAGS] = { .type = NLA_U32 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -20,9 +22,10 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; struct sk_buff *skb = pkt->skb; + char os_match[NFT_OSF_MAXGENRELEN + 1]; const struct tcphdr *tcp; + struct nf_osf_data data; struct tcphdr _tcph; - const char *os_name; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); @@ -35,11 +38,17 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - os_name =
nf_osf_find(skb, nf_osf_fingers, priv->ttl); - if (!os_name) + if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); - else - strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN); + } else { + if (priv->flags & NFT_OSF_F_VERSION) + snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", + data.genre, data.version); + else + strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); + + strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + } } static int nft_osf_init(const struct nft_ctx *ctx, @@ -47,6 +56,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_osf *priv = nft_expr_priv(expr); + u32 flags; int err; u8 ttl; @@ -57,6 +67,13 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->ttl = ttl; } + if (tb[NFTA_OSF_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS])); + if (flags != NFT_OSF_F_VERSION) + return -EINVAL; + priv->flags = flags; + } + priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); @@ -73,6 +90,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags))) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index a340cd8a751b..da74fdc4a684 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -82,7 +82,7 @@ static int nft_redir_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_redir *priv = nft_expr_priv(expr); @@ -202,6 +202,55 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { }; #endif +#ifdef CONFIG_NF_TABLES_INET +static void nft_redir_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + return nft_redir_ipv4_eval(expr, regs, pkt); + case NFPROTO_IPV6: + return nft_redir_ipv6_eval(expr, regs, pkt); + } + + WARN_ON_ONCE(1); +} + +static void +nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) +{ + nf_ct_netns_put(ctx->net, NFPROTO_INET); +} + +static struct nft_expr_type nft_redir_inet_type; +static const struct nft_expr_ops nft_redir_inet_ops = { + .type = &nft_redir_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_inet_eval, + .init = nft_redir_init, + .destroy = nft_redir_inet_destroy, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "redir", + .ops = &nft_redir_inet_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_module_init_inet(void) +{ + return nft_register_expr(&nft_redir_inet_type); +} +#else +static inline int nft_redir_module_init_inet(void) { return 0; } +#endif + static int __init nft_redir_module_init(void) { int ret = nft_register_expr(&nft_redir_ipv4_type); @@ -217,6 +266,15 @@ static int __init nft_redir_module_init(void) } #endif + ret = nft_redir_module_init_inet(); + if (ret < 0) { + 
nft_unregister_expr(&nft_redir_ipv4_type); +#ifdef CONFIG_NF_TABLES_IPV6 + nft_unregister_expr(&nft_redir_ipv6_type); +#endif + return ret; + } + return ret; } @@ -226,6 +284,9 @@ static void __exit nft_redir_module_exit(void) #ifdef CONFIG_NF_TABLES_IPV6 nft_unregister_expr(&nft_redir_ipv6_type); #endif +#ifdef CONFIG_NF_TABLES_INET + nft_unregister_expr(&nft_redir_inet_type); +#endif } module_init(nft_redir_module_init); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index b113fcac94e1..3d4c2ae605a8 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -166,8 +166,8 @@ static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr, - nft_tunnel_ip_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_IP_MAX, attr, + nft_tunnel_ip_policy, NULL); if (err < 0) return err; @@ -195,8 +195,8 @@ static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr, - nft_tunnel_ip6_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr, + nft_tunnel_ip6_policy, NULL); if (err < 0) return err; @@ -231,8 +231,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr, - nft_tunnel_opts_vxlan_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr, + nft_tunnel_opts_vxlan_policy, NULL); if (err < 0) return err; @@ -260,8 +260,9 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, uint8_t hwid, dir; int err, version; - err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr, - nft_tunnel_opts_erspan_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, + attr, nft_tunnel_opts_erspan_policy, + NULL); if (err < 0) return err; @@ -309,8 +310,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr, - nft_tunnel_opts_policy, NULL); + err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr, + nft_tunnel_opts_policy, NULL); if (err < 0) return err; @@ -437,7 +438,7 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) struct nlattr *nest; if (info->mode & IP_TUNNEL_INFO_IPV6) { - nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6); + nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_IP6); if (!nest) return -1; @@ -448,7 +449,7 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info) nla_nest_end(skb, nest); } else { - nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP); + nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_IP); if (!nest) return -1; @@ -468,7 +469,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct nft_tunnel_opts *opts = &priv->opts; struct nlattr *nest; - nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS); + nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS); if (!nest) return -1; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e5e5c64df8d1..0a6656ed1534 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -227,7 +227,7 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) 
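The masquerade, redirect and nat expressions above all gain NFPROTO_INET support through the same shape: a thin eval wrapper that dispatches on the packet's real family at run time (nat additionally keeps its priv->family check, so a v4-only nat statement inside an inet chain simply leaves IPv6 packets untouched). A condensed sketch of that wrapper, with my_* as placeholder names rather than symbols from this patch:

/* Placeholder sketch: my_ipv4_eval/my_ipv6_eval stand in for the
 * per-family eval routines registered in the hunks above.
 */
static void my_inet_eval(const struct nft_expr *expr,
			 struct nft_regs *regs,
			 const struct nft_pktinfo *pkt)
{
	switch (nft_pf(pkt)) {
	case NFPROTO_IPV4:
		return my_ipv4_eval(expr, regs, pkt);
	case NFPROTO_IPV6:
		return my_ipv6_eval(expr, regs, pkt);
	}

	WARN_ON_ONCE(1);	/* inet chains only see IPv4/IPv6 */
}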
EXPORT_SYMBOL_GPL(xt_request_find_match); /* Find target, grabs ref. Returns ERR_PTR() on error. */ -struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) +static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; int err = -ENOENT; @@ -255,7 +255,6 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) return ERR_PTR(err); } -EXPORT_SYMBOL(xt_find_target); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0fa863f57575..d59cb4730fac 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -103,85 +103,24 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, return 0; } -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT -static void __xt_ct_tg_timeout_put(struct nf_ct_timeout *timeout) -{ - typeof(nf_ct_timeout_put_hook) timeout_put; - - timeout_put = rcu_dereference(nf_ct_timeout_put_hook); - if (timeout_put) - timeout_put(timeout); -} -#endif - static int xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, const char *timeout_name) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT - typeof(nf_ct_timeout_find_get_hook) timeout_find_get; const struct nf_conntrack_l4proto *l4proto; - struct nf_ct_timeout *timeout; - struct nf_conn_timeout *timeout_ext; - const char *errmsg = NULL; - int ret = 0; u8 proto; - rcu_read_lock(); - timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); - if (timeout_find_get == NULL) { - ret = -ENOENT; - errmsg = "Timeout policy base is empty"; - goto out; - } - proto = xt_ct_find_proto(par); if (!proto) { - ret = -EINVAL; - errmsg = "You must specify a L4 protocol and not use inversions on it"; - goto out; - } - - timeout = timeout_find_get(par->net, timeout_name); - if (timeout == NULL) { - ret = -ENOENT; - pr_info_ratelimited("No such timeout policy \"%s\"\n", - timeout_name); - goto out; - } - - if (timeout->l3num != par->family) { - ret = -EINVAL; - pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", - timeout_name, 3, timeout->l3num); - goto err_put_timeout; + pr_info_ratelimited("You must specify a L4 protocol and not " + "use inversions on it"); + return -EINVAL; } - /* Make sure the timeout policy matches any existing protocol tracker, - * otherwise default to generic. 
- */ l4proto = nf_ct_l4proto_find(proto); - if (timeout->l4proto->l4proto != l4proto->l4proto) { - ret = -EINVAL; - pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", - timeout_name, 4, timeout->l4proto->l4proto); - goto err_put_timeout; - } - timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (!timeout_ext) { - ret = -ENOMEM; - goto err_put_timeout; - } + return nf_ct_set_timeout(par->net, ct, par->family, l4proto->l4proto, + timeout_name); - rcu_read_unlock(); - return ret; - -err_put_timeout: - __xt_ct_tg_timeout_put(timeout); -out: - rcu_read_unlock(); - if (errmsg) - pr_info_ratelimited("%s\n", errmsg); - return ret; #else return -EOPNOTSUPP; #endif @@ -328,26 +267,6 @@ static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par) return xt_ct_tg_check(par, par->targinfo); } -static void xt_ct_destroy_timeout(struct nf_conn *ct) -{ -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - struct nf_conn_timeout *timeout_ext; - typeof(nf_ct_timeout_put_hook) timeout_put; - - rcu_read_lock(); - timeout_put = rcu_dereference(nf_ct_timeout_put_hook); - - if (timeout_put) { - timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) { - timeout_put(timeout_ext->timeout); - RCU_INIT_POINTER(timeout_ext->timeout, NULL); - } - } - rcu_read_unlock(); -#endif -} - static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, struct xt_ct_target_info_v1 *info) { @@ -361,7 +280,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, nf_ct_netns_put(par->net, par->family); - xt_ct_destroy_timeout(ct); + nf_ct_destroy_timeout(ct); nf_ct_put(info->ct); } } diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c index fd3f9e8a74da..ece20d832adc 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/netfilter/xt_MASQUERADE.c @@ -9,20 +9,10 @@ * published by the Free Software Foundation. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/types.h> -#include <linux/inetdevice.h> -#include <linux/ip.h> -#include <linux/timer.h> #include <linux/module.h> -#include <linux/netfilter.h> -#include <net/protocol.h> -#include <net/ip.h> -#include <net/checksum.h> -#include <net/route.h> -#include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> -#include <net/netfilter/ipv4/nf_nat_masquerade.h> +#include <net/netfilter/nf_nat_masquerade.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -64,38 +54,78 @@ static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target masquerade_tg_reg __read_mostly = { - .name = "MASQUERADE", - .family = NFPROTO_IPV4, - .target = masquerade_tg, - .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), - .table = "nat", - .hooks = 1 << NF_INET_POST_ROUTING, - .checkentry = masquerade_tg_check, - .destroy = masquerade_tg_destroy, - .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_IPV6) +static unsigned int +masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par)); +} + +static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + + if (range->flags & NF_NAT_RANGE_MAP_IPS) + return -EINVAL; + + return nf_ct_netns_get(par->net, par->family); +} +#endif + +static struct xt_target masquerade_tg_reg[] __read_mostly = { + { +#if IS_ENABLED(CONFIG_IPV6) + .name = "MASQUERADE", + .family = NFPROTO_IPV6, + .target = masquerade_tg6, + .targetsize = sizeof(struct nf_nat_range), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg6_checkentry, + .destroy = masquerade_tg_destroy, + .me = THIS_MODULE, + }, { +#endif + .name = "MASQUERADE", + .family = NFPROTO_IPV4, + .target = masquerade_tg, + .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), + .table = "nat", + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check, + .destroy = masquerade_tg_destroy, + .me = THIS_MODULE, + } }; static int __init masquerade_tg_init(void) { int ret; - ret = xt_register_target(&masquerade_tg_reg); + ret = xt_register_targets(masquerade_tg_reg, + ARRAY_SIZE(masquerade_tg_reg)); if (ret) return ret; - ret = nf_nat_masquerade_ipv4_register_notifier(); - if (ret) - xt_unregister_target(&masquerade_tg_reg); + ret = nf_nat_masquerade_inet_register_notifiers(); + if (ret) { + xt_unregister_targets(masquerade_tg_reg, + ARRAY_SIZE(masquerade_tg_reg)); + return ret; + } return ret; } static void __exit masquerade_tg_exit(void) { - xt_unregister_target(&masquerade_tg_reg); - nf_nat_masquerade_ipv4_unregister_notifier(); + xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); + nf_nat_masquerade_inet_unregister_notifiers(); } module_init(masquerade_tg_init); module_exit(masquerade_tg_exit); +#if IS_ENABLED(CONFIG_IPV6) +MODULE_ALIAS("ip6t_MASQUERADE"); +#endif +MODULE_ALIAS("ipt_MASQUERADE"); diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index 4fa4efd24353..893374ac3758 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -15,7 +15,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); -MODULE_DESCRIPTION("Xtables: add/match connection trackling labels"); +MODULE_DESCRIPTION("Xtables: add/match connection tracking labels"); 
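The xt_MASQUERADE rework above (the file even moves from net/ipv4 to net/netfilter) folds the IPv6 target into the same module by turning the single xt_target into an array whose IPv6 entry only exists under IS_ENABLED(CONFIG_IPV6); xt_register_targets() with ARRAY_SIZE() then registers however many entries were built, and the MODULE_ALIAS lines keep both iptables and ip6tables autoloading working. The conditional-array-member pattern in isolation, as a runnable userspace sketch (stand-in types only, not kernel code):

#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
#define MY_IPV6_ENABLED 1	/* stand-in for IS_ENABLED(CONFIG_IPV6) */

struct my_target {
	const char *name;
	int family;	/* 2 ~ NFPROTO_IPV4, 10 ~ NFPROTO_IPV6 */
};

static const struct my_target targets[] = {
#if MY_IPV6_ENABLED
	{ .name = "MASQUERADE", .family = 10 },
#endif
	{ .name = "MASQUERADE", .family = 2 },
};

int main(void)
{
	/* the kernel equivalent is
	 * xt_register_targets(masquerade_tg_reg,
	 *                     ARRAY_SIZE(masquerade_tg_reg));
	 */
	for (size_t i = 0; i < ARRAY_SIZE(targets); i++)
		printf("%s, family %d\n", targets[i].name,
		       targets[i].family);
	return 0;
}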
MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 8d86e39d6280..a30536b17ee1 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -288,8 +288,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, size = 16; } /* FIXME: don't use vmalloc() here or anywhere else -HW */ - hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + - sizeof(struct hlist_head) * size); + hinfo = vmalloc(struct_size(hinfo, hash, size)); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index c13bcd0ab491..8dbb4d48f2ed 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -163,19 +163,24 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) s64 stamp; /* - * We cannot use get_seconds() instead of __net_timestamp() here. + * We need real time here, but we can neither use skb->tstamp + * nor __net_timestamp(). + * + * skb->tstamp and skb->skb_mstamp_ns overlap, however, they + * use different clock types (real vs monotonic). + * * Suppose you have two rules: - * 1. match before 13:00 - * 2. match after 13:00 + * 1. match before 13:00 + * 2. match after 13:00 + * * If you match against processing time (get_seconds) it * may happen that the same packet matches both rules if - * it arrived at the right moment before 13:00. + * it arrived at the right moment before 13:00, so it would be + * better to check skb->tstamp and set it via __net_timestamp() + * if needed. This however breaks outgoing packets tx timestamp, + * and causes them to get delayed forever by fq packet scheduler. */ - if (skb->tstamp == 0) - __net_timestamp((struct sk_buff *)skb); - - stamp = ktime_to_ns(skb->tstamp); - stamp = div_s64(stamp, NSEC_PER_SEC); + stamp = get_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 4d748975117d..1de87172885d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -321,29 +321,29 @@ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) static const struct genl_ops netlbl_calipso_ops[] = { { .cmd = NLBL_CALIPSO_C_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = calipso_genl_policy, .doit = netlbl_calipso_add, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_REMOVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = calipso_genl_policy, .doit = netlbl_calipso_remove, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_LIST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = calipso_genl_policy, .doit = netlbl_calipso_list, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_LISTALL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = calipso_genl_policy, .doit = NULL, .dumpit = netlbl_calipso_listall, }, @@ -354,6 +354,7 @@ static struct genl_family netlbl_calipso_gnl_family __ro_after_init = { .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_CALIPSO_A_MAX, + .policy = calipso_genl_policy, .module = THIS_MODULE, .ops = netlbl_calipso_ops, .n_ops = ARRAY_SIZE(netlbl_calipso_ops), diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 9aacf2da3d98..5d1121981d0b 100644 --- 
a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -99,9 +99,10 @@ static int netlbl_cipsov4_add_common(struct genl_info *info, doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); - if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_TAGLST], - NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy, NULL) != 0) + if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_TAGLST], + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) return -EINVAL; nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem) @@ -146,9 +147,10 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST]) return -EINVAL; - if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], - NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy, NULL) != 0) + if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); @@ -170,9 +172,10 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { - if (nla_validate_nested(nla_a, NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy, - NULL) != 0) + if (nla_validate_nested_deprecated(nla_a, + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { @@ -234,19 +237,20 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, } if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) { - if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSCATLST], - NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy, NULL) != 0) + if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_MLSCATLST], + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSCATLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { - if (nla_validate_nested(nla_a, - NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy, - NULL) != 0) + if (nla_validate_nested_deprecated(nla_a, + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { @@ -498,7 +502,7 @@ list_start: if (ret_val != 0) goto list_failure_lock; - nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_TAGLST); + nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_TAGLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_failure_lock; @@ -517,7 +521,8 @@ list_start: switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: - nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST); + nla_a = nla_nest_start_noflag(ans_skb, + NLBL_CIPSOV4_A_MLSLVLLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_failure_lock; @@ -529,7 +534,8 @@ list_start: CIPSO_V4_INV_LVL) continue; - nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVL); + nla_b = nla_nest_start_noflag(ans_skb, + NLBL_CIPSOV4_A_MLSLVL); if (nla_b == NULL) { ret_val = -ENOMEM; goto list_retry; @@ -548,7 +554,8 @@ list_start: } nla_nest_end(ans_skb, nla_a); - nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCATLST); + nla_a = nla_nest_start_noflag(ans_skb, + NLBL_CIPSOV4_A_MLSCATLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_retry; @@ -560,7 +567,8 @@ list_start: CIPSO_V4_INV_CAT) continue; - nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCAT); + nla_b = nla_nest_start_noflag(ans_skb, 
+ NLBL_CIPSOV4_A_MLSCAT); if (nla_b == NULL) { ret_val = -ENOMEM; goto list_retry; @@ -733,29 +741,29 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) static const struct genl_ops netlbl_cipsov4_ops[] = { { .cmd = NLBL_CIPSOV4_C_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_cipsov4_genl_policy, .doit = netlbl_cipsov4_add, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_REMOVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_cipsov4_genl_policy, .doit = netlbl_cipsov4_remove, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_LIST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_cipsov4_genl_policy, .doit = netlbl_cipsov4_list, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_LISTALL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_cipsov4_genl_policy, .doit = NULL, .dumpit = netlbl_cipsov4_listall, }, @@ -766,6 +774,7 @@ static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = { .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_CIPSOV4_A_MAX, + .policy = netlbl_cipsov4_genl_policy, .module = THIS_MODULE, .ops = netlbl_cipsov4_ops, .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops), diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 21e0095b1d14..cae04f207782 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -315,7 +315,7 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: - nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST); + nla_a = nla_nest_start_noflag(skb, NLBL_MGMT_A_SELECTORLIST); if (nla_a == NULL) return -ENOMEM; @@ -323,7 +323,8 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, struct netlbl_domaddr4_map *map4; struct in_addr addr_struct; - nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR); + nla_b = nla_nest_start_noflag(skb, + NLBL_MGMT_A_ADDRSELECTOR); if (nla_b == NULL) return -ENOMEM; @@ -357,7 +358,8 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) { struct netlbl_domaddr6_map *map6; - nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR); + nla_b = nla_nest_start_noflag(skb, + NLBL_MGMT_A_ADDRSELECTOR); if (nla_b == NULL) return -ENOMEM; @@ -772,57 +774,57 @@ version_failure: static const struct genl_ops netlbl_mgmt_genl_ops[] = { { .cmd = NLBL_MGMT_C_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_mgmt_genl_policy, .doit = netlbl_mgmt_add, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_REMOVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_mgmt_genl_policy, .doit = netlbl_mgmt_remove, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_LISTALL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_mgmt_genl_policy, .doit = NULL, .dumpit = netlbl_mgmt_listall, }, { .cmd = NLBL_MGMT_C_ADDDEF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_mgmt_genl_policy, .doit = netlbl_mgmt_adddef, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_REMOVEDEF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_mgmt_genl_policy, .doit = 
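/* Generic netlink policies now live on the family rather than on each
 * op; every op instead carries .validate flags, and the two
 * GENL_DONT_VALIDATE_* bits opt these pre-existing commands out of
 * strict attribute validation so that current userspace keeps working
 * unchanged.
 */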
netlbl_mgmt_removedef, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_LISTDEF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_mgmt_genl_policy, .doit = netlbl_mgmt_listdef, .dumpit = NULL, }, { .cmd = NLBL_MGMT_C_PROTOCOLS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_mgmt_genl_policy, .doit = NULL, .dumpit = netlbl_mgmt_protocols, }, { .cmd = NLBL_MGMT_C_VERSION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_mgmt_genl_policy, .doit = netlbl_mgmt_version, .dumpit = NULL, }, @@ -833,6 +835,7 @@ static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = { .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_MGMT_A_MAX, + .policy = netlbl_mgmt_genl_policy, .module = THIS_MODULE, .ops = netlbl_mgmt_genl_ops, .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops), diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index c92894c3e40a..b87dd34e1835 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1317,57 +1317,57 @@ unlabel_staticlistdef_return: static const struct genl_ops netlbl_unlabel_genl_ops[] = { { .cmd = NLBL_UNLABEL_C_STATICADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, .doit = netlbl_unlabel_staticadd, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, .doit = netlbl_unlabel_staticremove, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLIST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_unlabel_genl_policy, .doit = NULL, .dumpit = netlbl_unlabel_staticlist, }, { .cmd = NLBL_UNLABEL_C_STATICADDDEF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, .doit = netlbl_unlabel_staticadddef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, .doit = netlbl_unlabel_staticremovedef, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_STATICLISTDEF, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_unlabel_genl_policy, .doit = NULL, .dumpit = netlbl_unlabel_staticlistdef, }, { .cmd = NLBL_UNLABEL_C_ACCEPT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, .doit = netlbl_unlabel_accept, .dumpit = NULL, }, { .cmd = NLBL_UNLABEL_C_LIST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, - .policy = netlbl_unlabel_genl_policy, .doit = netlbl_unlabel_list, .dumpit = NULL, }, @@ -1378,6 +1378,7 @@ static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_UNLABEL_A_MAX, + .policy = netlbl_unlabel_genl_policy, .module = THIS_MODULE, .ops = netlbl_unlabel_genl_ops, .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f28e937320a3..216ab915dd54 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -988,7 +988,7 @@ static int 
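/* The point of this fix: nl_groups must only be read after the
 * addr_len and nl_family checks; with a short sockaddr the old code
 * read the groups field from uninitialized memory before bailing out
 * with -EINVAL.
 */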
netlink_bind(struct socket *sock, struct sockaddr *addr, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; - unsigned long groups = nladdr->nl_groups; + unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) @@ -996,6 +996,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nladdr->nl_family != AF_NETLINK) return -EINVAL; + groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f0ec068e1d02..efccd1ac9a66 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -362,8 +362,8 @@ int genl_register_family(struct genl_family *family) } else family->attrbuf = NULL; - family->id = idr_alloc(&genl_fam_idr, family, - start, end + 1, GFP_KERNEL); + family->id = idr_alloc_cyclic(&genl_fam_idr, family, + start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; goto errout_free; @@ -536,6 +536,28 @@ static int genl_family_rcv_msg(const struct genl_family *family, if (ops->dumpit == NULL) return -EOPNOTSUPP; + if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) { + int hdrlen = GENL_HDRLEN + family->hdrsize; + + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + return -EINVAL; + + if (family->maxattr) { + unsigned int validate = NL_VALIDATE_STRICT; + + if (ops->validate & + GENL_DONT_VALIDATE_DUMP_STRICT) + validate = NL_VALIDATE_LIBERAL; + rc = __nla_validate(nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), + family->maxattr, + family->policy, + validate, extack); + if (rc) + return rc; + } + } + if (!family->parallel_ops) { struct netlink_dump_control c = { .module = family->module, @@ -577,8 +599,13 @@ static int genl_family_rcv_msg(const struct genl_family *family, attrbuf = family->attrbuf; if (attrbuf) { - err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, - ops->policy, extack); + enum netlink_validation validate = NL_VALIDATE_STRICT; + + if (ops->validate & GENL_DONT_VALIDATE_STRICT) + validate = NL_VALIDATE_LIBERAL; + + err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, + family->policy, validate, extack); if (err < 0) goto out; } @@ -665,7 +692,7 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, struct nlattr *nla_ops; int i; - nla_ops = nla_nest_start(skb, CTRL_ATTR_OPS); + nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; @@ -678,10 +705,10 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, op_flags |= GENL_CMD_CAP_DUMP; if (ops->doit) op_flags |= GENL_CMD_CAP_DO; - if (ops->policy) + if (family->policy) op_flags |= GENL_CMD_CAP_HASPOL; - nest = nla_nest_start(skb, i + 1); + nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; @@ -699,7 +726,7 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, struct nlattr *nla_grps; int i; - nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS); + nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; @@ -709,7 +736,7 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, grp = &family->mcgrps[i]; - nest = nla_nest_start(skb, i + 1); + nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; @@ -749,11 +776,11 @@ static int ctrl_fill_mcgrp_info(const struct genl_family *family, nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id)) 
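/* Dump requests are validated now as well: unless an op sets
 * GENL_DONT_VALIDATE_DUMP, the trailing attributes of the dump
 * request are checked against the family policy, strictly by default
 * or liberally when GENL_DONT_VALIDATE_DUMP_STRICT is given.
 */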
goto nla_put_failure; - nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS); + nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; - nest = nla_nest_start(skb, 1); + nest = nla_nest_start_noflag(skb, 1); if (nest == NULL) goto nla_put_failure; @@ -938,9 +965,9 @@ static int genl_ctrl_event(int event, const struct genl_family *family, static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, - .policy = ctrl_policy, }, }; @@ -958,6 +985,7 @@ static struct genl_family genl_ctrl __ro_after_init = { .name = "nlctrl", .version = 0x2, .maxattr = CTRL_ATTR_MAX, + .policy = ctrl_policy, .netnsok = true, }; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1d3144d19903..167c09e1ea90 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1199,7 +1199,6 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; - int ret; switch (cmd) { case TIOCOUTQ: { @@ -1225,18 +1224,6 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return put_user(amount, (int __user *)argp); } - case SIOCGSTAMP: - lock_sock(sk); - ret = sock_get_timestamp(sk, argp); - release_sock(sk); - return ret; - - case SIOCGSTAMPNS: - lock_sock(sk); - ret = sock_get_timestampns(sk, argp); - release_sock(sk); - return ret; - case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1362,6 +1349,7 @@ static const struct proto_ops nr_proto_ops = { .getname = nr_getname, .poll = datagram_poll, .ioctl = nr_ioctl, + .gettstamp = sock_gettstamp, .listen = nr_listen, .shutdown = sock_no_shutdown, .setsockopt = nr_setsockopt, @@ -1392,18 +1380,22 @@ static int __init nr_proto_init(void) int i; int rc = proto_register(&nr_proto, 0); - if (rc != 0) - goto out; + if (rc) + return rc; if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { - printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n"); - return -1; + pr_err("NET/ROM: %s - nr_ndevs parameter too large\n", + __func__); + rc = -EINVAL; + goto unregister_proto; } dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); - if (dev_nr == NULL) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n"); - return -1; + if (!dev_nr) { + pr_err("NET/ROM: %s - unable to allocate device array\n", + __func__); + rc = -ENOMEM; + goto unregister_proto; } for (i = 0; i < nr_ndevs; i++) { @@ -1413,13 +1405,13 @@ static int __init nr_proto_init(void) sprintf(name, "nr%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup); if (!dev) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); + rc = -ENOMEM; goto fail; } dev->base_addr = i; - if (register_netdev(dev)) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n"); + rc = register_netdev(dev); + if (rc) { free_netdev(dev); goto fail; } @@ -1427,36 +1419,64 @@ static int __init nr_proto_init(void) dev_nr[i] = dev; } - if (sock_register(&nr_family_ops)) { - printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n"); + rc = sock_register(&nr_family_ops); + if (rc) goto fail; - } - register_netdevice_notifier(&nr_dev_notifier); + rc = register_netdevice_notifier(&nr_dev_notifier); + if (rc) + goto out_sock; ax25_register_pid(&nr_pid); 
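/* nr_proto_init() now unwinds properly: each later failure releases
 * everything set up before it (proc entries, sysctl, notifier, socket
 * family, devices, proto) and a real errno is returned in place of
 * the old bare -1.
 */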
ax25_linkfail_register(&nr_linkfail_notifier); #ifdef CONFIG_SYSCTL - nr_register_sysctl(); + rc = nr_register_sysctl(); + if (rc) + goto out_sysctl; #endif nr_loopback_init(); - proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops); - proc_create_seq("nr_neigh", 0444, init_net.proc_net, &nr_neigh_seqops); - proc_create_seq("nr_nodes", 0444, init_net.proc_net, &nr_node_seqops); -out: - return rc; + rc = -ENOMEM; + if (!proc_create_seq("nr", 0444, init_net.proc_net, &nr_info_seqops)) + goto proc_remove1; + if (!proc_create_seq("nr_neigh", 0444, init_net.proc_net, + &nr_neigh_seqops)) + goto proc_remove2; + if (!proc_create_seq("nr_nodes", 0444, init_net.proc_net, + &nr_node_seqops)) + goto proc_remove3; + + return 0; + +proc_remove3: + remove_proc_entry("nr_neigh", init_net.proc_net); +proc_remove2: + remove_proc_entry("nr", init_net.proc_net); +proc_remove1: + + nr_loopback_clear(); + nr_rt_free(); + +#ifdef CONFIG_SYSCTL + nr_unregister_sysctl(); +out_sysctl: +#endif + ax25_linkfail_release(&nr_linkfail_notifier); + ax25_protocol_release(AX25_P_NETROM); + unregister_netdevice_notifier(&nr_dev_notifier); +out_sock: + sock_unregister(PF_NETROM); fail: while (--i >= 0) { unregister_netdev(dev_nr[i]); free_netdev(dev_nr[i]); } kfree(dev_nr); +unregister_proto: proto_unregister(&nr_proto); - rc = -1; - goto out; + return rc; } module_init(nr_proto_init); diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 215ad22a9647..93d13f019981 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -70,7 +70,7 @@ static void nr_loopback_timer(struct timer_list *unused) } } -void __exit nr_loopback_clear(void) +void nr_loopback_clear(void) { del_timer_sync(&loopback_timer); skb_queue_purge(&loopback_queue); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 6485f593e2f0..b76aa668a94b 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -953,7 +953,7 @@ const struct seq_operations nr_neigh_seqops = { /* * Free all memory associated with the nodes and routes lists. 
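 *
 * The __exit annotation is dropped here (and on nr_loopback_clear()
 * above) so that the nr_proto_init() failure path may call these
 * helpers: __exit code can be discarded at link time when the code is
 * built in, so init paths must never reference it.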
*/ -void __exit nr_rt_free(void) +void nr_rt_free(void) { struct nr_neigh *s = NULL; struct nr_node *t = NULL; diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index ba1c368b3f18..771011b84270 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -146,9 +146,12 @@ static struct ctl_table nr_table[] = { { } }; -void __init nr_register_sysctl(void) +int __init nr_register_sysctl(void) { nr_table_header = register_net_sysctl(&init_net, "net/netrom", nr_table); + if (!nr_table_header) + return -ENOMEM; + return 0; } void nr_unregister_sysctl(void) diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index ddfc52ac1f9b..c0d323b58e73 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -312,6 +312,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; + if (new_pipe >= NCI_HCI_MAX_PIPES) { + status = NCI_HCI_ANY_E_NOK; + goto exit; + } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id @@ -336,6 +340,10 @@ static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { + status = NCI_HCI_ANY_E_NOK; + goto exit; + } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 376181cc1def..04a8e47674ec 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -119,9 +119,10 @@ static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) int rc; u32 idx; - rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize, - attrbuf, nfc_genl_family.maxattr, nfc_genl_policy, - NULL); + rc = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nfc_genl_family.hdrsize, + attrbuf, nfc_genl_family.maxattr, + nfc_genl_policy, NULL); if (rc < 0) return ERR_PTR(rc); @@ -392,7 +393,7 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; - sdp_attr = nla_nest_start(msg, NFC_ATTR_LLC_SDP); + sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; @@ -402,7 +403,7 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); - uri_attr = nla_nest_start(msg, i++); + uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; @@ -1177,8 +1178,9 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { - rc = nla_parse_nested(sdp_attrs, NFC_SDP_ATTR_MAX, attr, - nfc_sdp_genl_policy, info->extack); + rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, + attr, nfc_sdp_genl_policy, + info->extack); if (rc != 0) { rc = -EINVAL; @@ -1667,102 +1669,102 @@ EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_DEV_UP, + .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_DEV_DOWN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_START_POLL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_STOP_POLL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_DEP_LINK_UP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_GET_TARGET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_LLC_SDREQ, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_FW_DOWNLOAD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_ENABLE_SE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_DISABLE_SE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_GET_SE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_SE_IO, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_VENDOR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, - .policy = nfc_genl_policy, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, - .policy = nfc_genl_policy, }, }; @@ -1771,6 +1773,7 @@ static struct genl_family nfc_genl_family __ro_after_init = { .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, + .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e47ebbbe71b8..2c151bb322c1 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -169,6 +169,10 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, const struct nlattr *actions, int len, bool last, bool clone_flow_key); +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + 
const struct nlattr *attr, int len); + static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) { @@ -1213,6 +1217,40 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true); } +static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, bool last) +{ + const struct nlattr *actions, *cpl_arg; + const struct check_pkt_len_arg *arg; + int rem = nla_len(attr); + bool clone_flow_key; + + /* The first netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. + */ + cpl_arg = nla_data(attr); + arg = nla_data(cpl_arg); + + if (skb->len <= arg->pkt_len) { + /* Second netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. + */ + actions = nla_next(cpl_arg, &rem); + clone_flow_key = !arg->exec_for_lesser_equal; + } else { + /* Third netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER'. + */ + actions = nla_next(cpl_arg, &rem); + actions = nla_next(actions, &rem); + clone_flow_key = !arg->exec_for_greater; + } + + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), last, clone_flow_key); +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -1374,6 +1412,16 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; } + + case OVS_ACTION_ATTR_CHECK_PKT_LEN: { + bool last = nla_is_last(a, rem); + + err = execute_check_pkt_len(dp, skb, key, a, last); + if (last) + return err; + + break; + } } if (unlikely(err)) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 1b6896896fff..4c597a0bb168 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -24,11 +24,12 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/ipv6_frag.h> -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #endif @@ -73,7 +74,8 @@ struct ovs_conntrack_info { u32 eventmask; /* Mask of 1 << IPCT_*. */ struct md_mark mark; struct md_labels labels; -#ifdef CONFIG_NF_NAT_NEEDED + char timeout[CTNL_TIMEOUT_NAME_MAX]; +#if IS_ENABLED(CONFIG_NF_NAT) struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif }; @@ -719,7 +721,7 @@ static bool skb_nfct_cached(struct net *net, return ct_executed; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. * Returns either NF_ACCEPT or NF_DROP. 
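The execute_check_pkt_len() hunk above dispatches on a fixed attribute layout: the argument struct first, then the actions for the less-or-equal case, then the actions for the greater case. A minimal userspace sketch of that branch selection, with toy types standing in for the kernel's nlattr machinery (all names here are illustrative, not kernel API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_actions { const char *name; };

struct toy_cpl_arg {
	uint16_t pkt_len;		/* threshold carried in the action */
	bool exec_for_lesser_equal;	/* true: branch may run on the skb in place */
	bool exec_for_greater;
};

/* Pick the branch the way the kernel helper does: compare the packet
 * length against the threshold, then decide whether the flow key must
 * be cloned (only when the chosen branch may change the flow). */
static const struct toy_actions *pick_branch(const struct toy_cpl_arg *arg,
					     size_t skb_len,
					     const struct toy_actions *le,
					     const struct toy_actions *gt,
					     bool *clone_flow_key)
{
	if (skb_len <= arg->pkt_len) {
		*clone_flow_key = !arg->exec_for_lesser_equal;
		return le;
	}
	*clone_flow_key = !arg->exec_for_greater;
	return gt;
}

int main(void)
{
	struct toy_cpl_arg arg = { .pkt_len = 1500,
				   .exec_for_lesser_equal = true,
				   .exec_for_greater = false };
	struct toy_actions le = { "small-packet actions" };
	struct toy_actions gt = { "big-packet actions" };
	bool clone;
	const struct toy_actions *a = pick_branch(&arg, 64, &le, &gt, &clone);

	printf("%s, clone_flow_key=%d\n", a->name, clone);
	return 0;
}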
@@ -901,7 +903,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, return err; } -#else /* !CONFIG_NF_NAT_NEEDED */ +#else /* !CONFIG_NF_NAT */ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb, struct nf_conn *ct, @@ -990,6 +992,12 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, GFP_ATOMIC); if (err) return err; + + /* helper installed, add seqadj if NAT is required */ + if (info->nat && !nfct_seqadj(ct)) { + if (!nfct_seqadj_ext_add(ct)) + return -EINVAL; + } } /* Call the helper only if: @@ -1299,6 +1307,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, { struct nf_conntrack_helper *helper; struct nf_conn_help *help; + int ret = 0; helper = nf_conntrack_helper_try_module_get(name, info->family, key->ip.proto); @@ -1313,16 +1322,24 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return -ENOMEM; } +#if IS_ENABLED(CONFIG_NF_NAT) + if (info->nat) { + ret = nf_nat_helper_try_module_get(name, info->family, + key->ip.proto); + if (ret) { + nf_conntrack_helper_put(helper); + OVS_NLERR(log, "Failed to load \"%s\" NAT helper, error: %d", + name, ret); + return ret; + } + } +#endif rcu_assign_pointer(help->helper, helper); info->helper = helper; - - if (info->nat) - request_module("ip_nat_%s", name); - - return 0; + return ret; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) static int parse_nat(const struct nlattr *attr, struct ovs_conntrack_info *info, bool log) { @@ -1459,12 +1476,14 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(struct md_labels) }, [OVS_CT_ATTR_HELPER] = { .minlen = 1, .maxlen = NF_CT_HELPER_NAME_LEN }, -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) /* NAT length is checked when parsing the nested attributes. 
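ovs_ct_add_helper() above now pairs a NAT-helper module reference with the conntrack helper reference, unwinding the first when the second cannot be taken; __ovs_ct_free_action() later releases both in the same pairing. A toy sketch of that acquire-with-unwind shape (stand-in functions, not the kernel API):

#include <stdio.h>

static int get_ct_helper(void)  { puts("get ct helper");  return 0; }
static void put_ct_helper(void) { puts("put ct helper"); }
static int get_nat_helper(void) { puts("get nat helper"); return -1; /* simulate failure */ }

static int add_helper(int want_nat)
{
	int ret = get_ct_helper();

	if (ret)
		return ret;
	if (want_nat) {
		ret = get_nat_helper();
		if (ret) {
			/* Unwind the first reference before reporting
			 * failure, mirroring the hunk above. */
			put_ct_helper();
			return ret;
		}
	}
	return 0;
}

int main(void)
{
	printf("add_helper -> %d\n", add_helper(1));
	return 0;
}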
*/ [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, #endif [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_TIMEOUT] = { .minlen = 1, + .maxlen = CTNL_TIMEOUT_NAME_MAX }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -1537,7 +1556,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return -EINVAL; } break; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) case OVS_CT_ATTR_NAT: { int err = parse_nat(a, info, log); @@ -1550,6 +1569,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, info->have_eventmask = true; info->eventmask = nla_get_u32(a); break; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + case OVS_CT_ATTR_TIMEOUT: + memcpy(info->timeout, nla_data(a), nla_len(a)); + if (!memchr(info->timeout, '\0', nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack timeout"); + return -EINVAL; + } + break; +#endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", @@ -1631,6 +1659,14 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } + + if (ct_info.timeout[0]) { + if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, + ct_info.timeout)) + pr_info_ratelimited("Failed to associate timeout " + "policy `%s'\n", ct_info.timeout); + } + if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -1650,13 +1686,13 @@ err_free_ct: return err; } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, struct sk_buff *skb) { struct nlattr *start; - start = nla_nest_start(skb, OVS_CT_ATTR_NAT); + start = nla_nest_start_noflag(skb, OVS_CT_ATTR_NAT); if (!start) return false; @@ -1723,7 +1759,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, { struct nlattr *start; - start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CT); if (!start) return -EMSGSIZE; @@ -1751,8 +1787,12 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (ct_info->have_eventmask && nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) return -EMSGSIZE; + if (ct_info->timeout[0]) { + if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout)) + return -EMSGSIZE; + } -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) return -EMSGSIZE; #endif @@ -1770,10 +1810,18 @@ void ovs_ct_free_action(const struct nlattr *a) static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) { - if (ct_info->helper) + if (ct_info->helper) { +#if IS_ENABLED(CONFIG_NF_NAT) + if (ct_info->nat) + nf_nat_helper_put(ct_info->helper); +#endif nf_conntrack_helper_put(ct_info->helper); - if (ct_info->ct) + } + if (ct_info->ct) { + if (ct_info->timeout[0]) + nf_ct_destroy_timeout(ct_info->ct); nf_ct_tmpl_free(ct_info->ct); + } } #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) @@ -2126,7 +2174,11 @@ static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(reply)) return PTR_ERR(reply); - nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); + nla_reply = nla_nest_start_noflag(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); + if (!nla_reply) { + err = -EMSGSIZE; + goto exit_err; + } if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { err = ovs_ct_limit_get_zone_limit( @@ -2152,20 +2204,20 @@ exit_err: static struct genl_ops 
ct_limit_genl_ops[] = { { .cmd = OVS_CT_LIMIT_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ - .policy = ct_limit_policy, .doit = ovs_ct_limit_cmd_set, }, { .cmd = OVS_CT_LIMIT_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ - .policy = ct_limit_policy, .doit = ovs_ct_limit_cmd_del, }, { .cmd = OVS_CT_LIMIT_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = ct_limit_policy, .doit = ovs_ct_limit_cmd_get, }, }; @@ -2179,6 +2231,7 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = { .name = OVS_CT_LIMIT_FAMILY, .version = OVS_CT_LIMIT_VERSION, .maxattr = OVS_CT_LIMIT_ATTR_MAX, + .policy = ct_limit_policy, .netnsok = true, .parallel_ops = true, .ops = ct_limit_genl_ops, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9dd158ab51b3..dc9ff9367221 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -455,7 +455,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, upcall->dp_ifindex = dp_ifindex; err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); - BUG_ON(err); + if (err) + goto out; if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, @@ -463,19 +464,22 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_data(upcall_info->userdata)); if (upcall_info->egress_tun_info) { - nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); + nla = nla_nest_start_noflag(user_skb, + OVS_PACKET_ATTR_EGRESS_TUN_KEY); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_tunnel_info(user_skb, upcall_info->egress_tun_info); - BUG_ON(err); + if (err) + goto out; + nla_nest_end(user_skb, nla); } if (upcall_info->actions_len) { - nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); if (!nla) { err = -EMSGSIZE; goto out; @@ -638,8 +642,8 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { static const struct genl_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = packet_policy, .doit = ovs_packet_cmd_execute } }; @@ -649,6 +653,7 @@ static struct genl_family dp_packet_genl_family __ro_after_init = { .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, + .policy = packet_policy, .netnsok = true, .parallel_ops = true, .ops = dp_packet_genl_ops, @@ -776,7 +781,7 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, * This can only fail for dump operations because the skb is always * properly sized for single flows. 
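queue_userspace_packet() above trades BUG_ON(err) for a "goto out" unwind, so a serialization failure is reported to the caller instead of crashing the kernel. The same shape as a standalone sketch (toy names only):

#include <errno.h>
#include <stdio.h>

static int put_key(int fail) { return fail ? -EMSGSIZE : 0; }

static int build_upcall(int fail)
{
	int err;

	err = put_key(fail);
	if (err)
		goto out;	/* previously: BUG_ON(err) */
	puts("upcall built");
	return 0;
out:
	puts("upcall dropped, error propagated");
	return err;
}

int main(void)
{
	printf("%d\n", build_upcall(0));
	printf("%d\n", build_upcall(1));
	return 0;
}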
*/ - start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); + start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { const struct sw_flow_actions *sf_acts; @@ -1374,8 +1379,8 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) u32 ufid_flags; int err; - err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a, - OVS_FLOW_ATTR_MAX, flow_policy, NULL); + err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, + OVS_FLOW_ATTR_MAX, flow_policy, NULL); if (err) return err; ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -1423,24 +1428,24 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { static const struct genl_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, .doit = ovs_flow_cmd_del }, { .cmd = OVS_FLOW_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = flow_policy, .doit = ovs_flow_cmd_get, .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, .doit = ovs_flow_cmd_set, }, }; @@ -1450,6 +1455,7 @@ static struct genl_family dp_flow_genl_family __ro_after_init = { .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, + .policy = flow_policy, .netnsok = true, .parallel_ops = true, .ops = dp_flow_genl_ops, @@ -1816,24 +1822,24 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { static const struct genl_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_del }, { .cmd = OVS_DP_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_get, .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_set, }, }; @@ -1843,6 +1849,7 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, + .policy = datapath_policy, .netnsok = true, .parallel_ops = true, .ops = dp_datapath_genl_ops, @@ -2259,24 +2266,24 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { static const struct genl_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. 
*/ - .policy = vport_policy, .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, .doit = ovs_vport_cmd_del }, { .cmd = OVS_VPORT_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = vport_policy, .doit = ovs_vport_cmd_get, .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, .doit = ovs_vport_cmd_set, }, }; @@ -2286,6 +2293,7 @@ struct genl_family dp_vport_genl_family __ro_after_init = { .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, + .policy = vport_policy, .netnsok = true, .parallel_ops = true, .ops = dp_vport_genl_ops, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 691da853bef5..54eb80dd2dc6 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -91,6 +91,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: case OVS_ACTION_ATTR_METER: + case OVS_ACTION_ATTR_CHECK_PKT_LEN: default: return true; } @@ -403,6 +404,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE] = { .len = 0 }, }; static const struct ovs_len_tbl @@ -666,6 +668,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, bool log) { bool ttl = false, ipv4 = false, ipv6 = false; + bool info_bridge_mode = false; __be16 tun_flags = 0; int opts_type = 0; struct nlattr *a; @@ -782,6 +785,10 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_ERSPAN_OPT; opts_type = type; break; + case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE: + info_bridge_mode = true; + ipv4 = true; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -812,16 +819,29 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, OVS_NLERR(log, "IP tunnel dst address not specified"); return -EINVAL; } - if (ipv4 && !match->key->tun_key.u.ipv4.dst) { - OVS_NLERR(log, "IPv4 tunnel dst address is zero"); - return -EINVAL; + if (ipv4) { + if (info_bridge_mode) { + if (match->key->tun_key.u.ipv4.src || + match->key->tun_key.u.ipv4.dst || + match->key->tun_key.tp_src || + match->key->tun_key.tp_dst || + match->key->tun_key.ttl || + match->key->tun_key.tos || + tun_flags & ~TUNNEL_KEY) { + OVS_NLERR(log, "IPv4 tun info is not correct"); + return -EINVAL; + } + } else if (!match->key->tun_key.u.ipv4.dst) { + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); + return -EINVAL; + } } if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { OVS_NLERR(log, "IPv6 tunnel dst address is zero"); return -EINVAL; } - if (!ttl) { + if (!ttl && !info_bridge_mode) { OVS_NLERR(log, "IP tunnel TTL not specified."); return -EINVAL; } @@ -836,7 +856,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; - nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + nla = nla_nest_start_noflag(skb, 
OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); if (!nla) return -EMSGSIZE; @@ -850,12 +870,17 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, static int __ip_tun_to_nlattr(struct sk_buff *skb, const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len, - unsigned short tun_proto) + unsigned short tun_proto, u8 mode) { if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, OVS_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; + + if (mode & IP_TUNNEL_INFO_BRIDGE) + return nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE) + ? -EMSGSIZE : 0; + switch (tun_proto) { case AF_INET: if (output->u.ipv4.src && @@ -918,17 +943,17 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, static int ip_tun_to_nlattr(struct sk_buff *skb, const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len, - unsigned short tun_proto) + unsigned short tun_proto, u8 mode) { struct nlattr *nla; int err; - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + nla = nla_nest_start_noflag(skb, OVS_KEY_ATTR_TUNNEL); if (!nla) return -EMSGSIZE; err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, - tun_proto); + tun_proto, mode); if (err) return err; @@ -942,7 +967,7 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb, return __ip_tun_to_nlattr(skb, &tun_info->key, ip_tunnel_info_opts(tun_info), tun_info->options_len, - ip_tunnel_info_af(tun_info)); + ip_tunnel_info_af(tun_info), tun_info->mode); } static int encode_vlan_from_nlattrs(struct sw_flow_match *match, @@ -1932,7 +1957,7 @@ static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, { struct nlattr *start; - start = nla_nest_start(skb, OVS_KEY_ATTR_NSH); + start = nla_nest_start_noflag(skb, OVS_KEY_ATTR_NSH); if (!start) return -EMSGSIZE; @@ -1980,7 +2005,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); if (ip_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len, swkey->tun_proto)) + swkey->tun_opts_len, swkey->tun_proto, 0)) goto nla_put_failure; } @@ -2015,14 +2040,15 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) goto nla_put_failure; - encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + encap = nla_nest_start_noflag(skb, OVS_KEY_ATTR_ENCAP); if (!swkey->eth.vlan.tci) goto unencap; if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) goto nla_put_failure; - in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + in_encap = nla_nest_start_noflag(skb, + OVS_KEY_ATTR_ENCAP); if (!swkey->eth.cvlan.tci) goto unencap; } @@ -2201,7 +2227,7 @@ int ovs_nla_put_key(const struct sw_flow_key *swkey, int err; struct nlattr *nla; - nla = nla_nest_start(skb, attr); + nla = nla_nest_start_noflag(skb, attr); if (!nla) return -EMSGSIZE; err = __ovs_nla_put_key(swkey, output, is_mask, skb); @@ -2306,14 +2332,14 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, struct sw_flow_actions *acts; int new_acts_size; - int req_size = NLA_ALIGN(attr_len); + size_t req_size = NLA_ALIGN(attr_len); int next_offset = offsetof(struct sw_flow_actions, actions) + (*sfa)->actions_len; if (req_size <= (ksize(*sfa) - next_offset)) goto out; - new_acts_size = ksize(*sfa) * 2; + new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); if (new_acts_size > MAX_ACTIONS_BUFSIZE) { if ((MAX_ACTIONS_BUFSIZE - 
next_offset) < req_size) { @@ -2605,6 +2631,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, tun_info->mode = IP_TUNNEL_INFO_TX; if (key.tun_proto == AF_INET6) tun_info->mode |= IP_TUNNEL_INFO_IPV6; + else if (key.tun_proto == AF_INET && key.tun_key.u.ipv4.dst == 0) + tun_info->mode |= IP_TUNNEL_INFO_BRIDGE; tun_info->key = key.tun_key; /* We need to store the options in the action itself since @@ -2826,8 +2854,8 @@ static int validate_userspace(const struct nlattr *attr) struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; - error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, attr, - userspace_policy, NULL); + error = nla_parse_nested_deprecated(a, OVS_USERSPACE_ATTR_MAX, attr, + userspace_policy, NULL); if (error) return error; @@ -2838,6 +2866,88 @@ static int validate_userspace(const struct nlattr *attr) return 0; } +static const struct nla_policy cpl_policy[OVS_CHECK_PKT_LEN_ATTR_MAX + 1] = { + [OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] = {.type = NLA_U16 }, + [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER] = {.type = NLA_NESTED }, + [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL] = {.type = NLA_NESTED }, +}; + +static int validate_and_copy_check_pkt_len(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) +{ + const struct nlattr *acts_if_greater, *acts_if_lesser_eq; + struct nlattr *a[OVS_CHECK_PKT_LEN_ATTR_MAX + 1]; + struct check_pkt_len_arg arg; + int nested_acts_start; + int start, err; + + err = nla_parse_deprecated_strict(a, OVS_CHECK_PKT_LEN_ATTR_MAX, + nla_data(attr), nla_len(attr), + cpl_policy, NULL); + if (err) + return err; + + if (!a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] || + !nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN])) + return -EINVAL; + + acts_if_lesser_eq = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL]; + acts_if_greater = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER]; + + /* Both nested actions should be present. */ + if (!acts_if_greater || !acts_if_lesser_eq) + return -EINVAL; + + /* validation done, copy the nested actions. 
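validate_and_copy_check_pkt_len() above refuses a missing or zero threshold and requires both nested action lists before copying anything. Reduced to a standalone check (toy attribute table, not real nlattrs):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_attrs {
	unsigned short pkt_len;		/* 0 means "attribute absent or zero" */
	bool have_le_actions;
	bool have_gt_actions;
};

static int validate_cpl(const struct toy_attrs *a)
{
	if (!a->pkt_len)		/* missing or zero threshold */
		return -EINVAL;
	if (!a->have_le_actions || !a->have_gt_actions)
		return -EINVAL;		/* both branches must be present */
	return 0;
}

int main(void)
{
	struct toy_attrs ok = { 1500, true, true };
	struct toy_attrs bad = { 1500, true, false };

	printf("%d %d\n", validate_cpl(&ok), validate_cpl(&bad));
	return 0;
}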
*/ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CHECK_PKT_LEN, + log); + if (start < 0) + return start; + + arg.pkt_len = nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN]); + arg.exec_for_lesser_equal = + last || !actions_may_change_flow(acts_if_lesser_eq); + arg.exec_for_greater = + last || !actions_may_change_flow(acts_if_greater); + + err = ovs_nla_add_action(sfa, OVS_CHECK_PKT_LEN_ATTR_ARG, &arg, + sizeof(arg), log); + if (err) + return err; + + nested_acts_start = add_nested_action_start(sfa, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, log); + if (nested_acts_start < 0) + return nested_acts_start; + + err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, + eth_type, vlan_tci, log); + + if (err) + return err; + + add_nested_action_end(*sfa, nested_acts_start); + + nested_acts_start = add_nested_action_start(sfa, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, log); + if (nested_acts_start < 0) + return nested_acts_start; + + err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, + eth_type, vlan_tci, log); + + if (err) + return err; + + add_nested_action_end(*sfa, nested_acts_start); + add_nested_action_end(*sfa, start); + return 0; +} + static int copy_action(const struct nlattr *from, struct sw_flow_actions **sfa, bool log) { @@ -2884,6 +2994,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_POP_NSH] = 0, [OVS_ACTION_ATTR_METER] = sizeof(u32), [OVS_ACTION_ATTR_CLONE] = (u32)-1, + [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3085,6 +3196,19 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, break; } + case OVS_ACTION_ATTR_CHECK_PKT_LEN: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_check_pkt_len(net, a, key, sfa, + eth_type, + vlan_tci, log, + last); + if (err) + return err; + skip_copy = true; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3130,7 +3254,7 @@ static int sample_action_to_attr(const struct nlattr *attr, const struct sample_arg *arg; struct nlattr *actions; - start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SAMPLE); if (!start) return -EMSGSIZE; @@ -3143,7 +3267,7 @@ static int sample_action_to_attr(const struct nlattr *attr, goto out; } - ac_start = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + ac_start = nla_nest_start_noflag(skb, OVS_SAMPLE_ATTR_ACTIONS); if (!ac_start) { err = -EMSGSIZE; goto out; @@ -3169,7 +3293,7 @@ static int clone_action_to_attr(const struct nlattr *attr, struct nlattr *start; int err = 0, rem = nla_len(attr); - start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE); + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CLONE); if (!start) return -EMSGSIZE; @@ -3183,6 +3307,75 @@ static int clone_action_to_attr(const struct nlattr *attr, return err; } +static int check_pkt_len_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start, *ac_start = NULL; + const struct check_pkt_len_arg *arg; + const struct nlattr *a, *cpl_arg; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CHECK_PKT_LEN); + if (!start) + return -EMSGSIZE; + + /* The first nested attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. 
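check_pkt_len_action_to_attr() above can step through its three sub-attributes blindly with nla_next() because the copy phase guarantees the order ARG, then IF_LESS_EQUAL actions, then IF_GREATER actions. A toy TLV walk showing the same sequential stepping (toy layout, not netlink's real framing):

#include <stdio.h>
#include <string.h>

struct tlv {
	unsigned char len;		/* payload bytes that follow */
};

static const struct tlv *tlv_next(const struct tlv *t, int *rem)
{
	int step = (int)sizeof(*t) + t->len;

	*rem -= step;
	return (const struct tlv *)((const char *)t + step);
}

int main(void)
{
	unsigned char buf[64];
	struct tlv t = { .len = 4 };
	const struct tlv *p;
	int off = 0, rem, i;

	for (i = 0; i < 3; i++) {	/* ARG, LE-actions, GT-actions */
		memcpy(buf + off, &t, sizeof(t));
		off += (int)sizeof(t) + t.len;
	}

	rem = off;
	for (p = (const struct tlv *)buf; rem > 0; p = tlv_next(p, &rem))
		printf("attribute with %u payload bytes\n", (unsigned)p->len);
	return 0;
}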
+ */ + cpl_arg = nla_data(attr); + arg = nla_data(cpl_arg); + + if (nla_put_u16(skb, OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, arg->pkt_len)) { + err = -EMSGSIZE; + goto out; + } + + /* Second nested attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. + */ + a = nla_next(cpl_arg, &rem); + ac_start = nla_nest_start_noflag(skb, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) { + nla_nest_cancel(skb, ac_start); + goto out; + } else { + nla_nest_end(skb, ac_start); + } + + /* Third nested attribute in 'attr' is always + * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER. + */ + a = nla_next(a, &rem); + ac_start = nla_nest_start_noflag(skb, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) { + nla_nest_cancel(skb, ac_start); + goto out; + } else { + nla_nest_end(skb, ac_start); + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3195,14 +3388,14 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; - start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; err = ip_tun_to_nlattr(skb, &tun_info->key, ip_tunnel_info_opts(tun_info), tun_info->options_len, - ip_tunnel_info_af(tun_info)); + ip_tunnel_info_af(tun_info), tun_info->mode); if (err) return err; nla_nest_end(skb, start); @@ -3227,7 +3420,7 @@ static int masked_set_action_to_set_action_attr(const struct nlattr *a, /* Revert the conversion we did from a non-masked set action to * masked set action. 
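On any put failure the serializer above cancels the open nest so the half-written attribute vanishes from the message. A toy buffer with a rollback mark shows the same idea (illustrative only):

#include <errno.h>
#include <stdio.h>

struct toy_msg { char buf[64]; int len; };

static int nest_start(struct toy_msg *m) { return m->len; /* save mark */ }
static void nest_cancel(struct toy_msg *m, int mark) { m->len = mark; }

static int put_bytes(struct toy_msg *m, int n, int fail)
{
	if (fail || m->len + n > (int)sizeof(m->buf))
		return -EMSGSIZE;
	m->len += n;
	return 0;
}

int main(void)
{
	struct toy_msg m = { .len = 0 };
	int mark = nest_start(&m);

	if (put_bytes(&m, 16, 1 /* simulate failure */)) {
		nest_cancel(&m, mark);	/* message length rolled back */
		printf("cancelled, len=%d\n", m.len);
	}
	return 0;
}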
*/ - nla = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + nla = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET); if (!nla) return -EMSGSIZE; @@ -3277,6 +3470,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + err = check_pkt_len_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 43849d752a1e..bb67238f0340 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -127,7 +127,7 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, OVS_METER_ATTR_PAD)) goto error; - nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto error; @@ -136,7 +136,7 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, for (i = 0; i < meter->n_bands; ++i, ++band) { struct nlattr *band_nla; - band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, sizeof(struct ovs_flow_stats), &band->stats)) @@ -166,11 +166,11 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; - nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto nla_put_failure; - band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla) goto nla_put_failure; /* Currently only DROP band type is supported. */ @@ -227,9 +227,9 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; u32 band_max_delta_t; - err = nla_parse((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, - nla_data(nla), nla_len(nla), band_policy, - NULL); + err = nla_parse_deprecated((struct nlattr **)&attr, + OVS_BAND_ATTR_MAX, nla_data(nla), + nla_len(nla), band_policy, NULL); if (err) goto exit_free_meter; @@ -526,27 +526,27 @@ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, static struct genl_ops dp_meter_genl_ops[] = { { .cmd = OVS_METER_CMD_FEATURES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = meter_policy, .doit = ovs_meter_cmd_features }, { .cmd = OVS_METER_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ - .policy = meter_policy, .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = meter_policy, .doit = ovs_meter_cmd_get, }, { .cmd = OVS_METER_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. 
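Throughout these genl_ops arrays the per-op '.policy' pointer moves to a single family-wide '.policy', so one table validates every command. The layout change, reduced to toy struct definitions rather than the kernel's real ones:

#include <stdio.h>

struct toy_policy { int maxtype; };

struct toy_op {
	int cmd;
	/* .policy removed: validation data now lives on the family */
};

struct toy_family {
	const char *name;
	const struct toy_policy *policy;	/* shared by every op */
	const struct toy_op *ops;
	int n_ops;
};

static const struct toy_policy meter_toy_policy = { .maxtype = 8 };
static const struct toy_op meter_toy_ops[] = { { .cmd = 1 }, { .cmd = 2 } };

static const struct toy_family meter_toy_family = {
	.name = "toy_meter",
	.policy = &meter_toy_policy,
	.ops = meter_toy_ops,
	.n_ops = 2,
};

int main(void)
{
	printf("%s: %d ops share one policy (maxtype %d)\n",
	       meter_toy_family.name, meter_toy_family.n_ops,
	       meter_toy_family.policy->maxtype);
	return 0;
}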
*/ - .policy = meter_policy, .doit = ovs_meter_cmd_del }, }; @@ -560,6 +560,7 @@ struct genl_family dp_meter_genl_family __ro_after_init = { .name = OVS_METER_FAMILY, .version = OVS_METER_VERSION, .maxattr = OVS_METER_ATTR_MAX, + .policy = meter_policy, .netnsok = true, .parallel_ops = true, .ops = dp_meter_genl_ops, diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 8f16f11f7ad3..f3c54871f9e1 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -43,7 +43,7 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) if (vxlan->cfg.flags & VXLAN_F_GBP) { struct nlattr *exts; - exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + exts = nla_nest_start_noflag(skb, OVS_TUNNEL_ATTR_EXTENSION); if (!exts) return -EMSGSIZE; @@ -70,8 +70,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, if (nla_len(attr) < sizeof(struct nlattr)) return -EINVAL; - err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy, - NULL); + err = nla_parse_nested_deprecated(exts, OVS_VXLAN_EXT_MAX, attr, + exts_policy, NULL); if (err < 0) return err; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 19f6765566e7..258ce3b7b452 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -319,7 +319,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) if (!vport->ops->get_options) return 0; - nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); + nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9419c5cf4de5..fbc775fbf712 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -275,24 +275,22 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL); -} - static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; + int cpu = raw_smp_processor_id(); u16 queue_index; +#ifdef CONFIG_XPS + skb->sender_cpu = cpu + 1; +#endif + skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { - queue_index = ops->ndo_select_queue(dev, skb, NULL, - __packet_pick_tx_queue); + queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { - queue_index = __packet_pick_tx_queue(dev, skb, NULL); + queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; @@ -2602,8 +2600,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); + unsigned char *addr = NULL; int tp_len, size_max; - unsigned char *addr; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; @@ -2614,7 +2612,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = po->num; - addr = NULL; } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2624,10 +2621,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) sll_addr))) goto out; proto = saddr->sll_protocol; - addr = saddr->sll_halen ? 
saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); - if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out_put; + if (po->sk.sk_socket->type == SOCK_DGRAM) { + if (dev && msg->msg_namelen < dev->addr_len + + offsetof(struct sockaddr_ll, sll_addr)) + goto out_put; + addr = saddr->sll_addr; + } } err = -ENXIO; @@ -2799,7 +2799,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; struct net_device *dev; __be16 proto; - unsigned char *addr; + unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; @@ -2816,7 +2816,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = po->num; - addr = NULL; } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2824,10 +2823,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; - addr = saddr->sll_halen ? saddr->sll_addr : NULL; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); - if (addr && dev && saddr->sll_halen < dev->addr_len) - goto out_unlock; + if (sock->type == SOCK_DGRAM) { + if (dev && msg->msg_namelen < dev->addr_len + + offsetof(struct sockaddr_ll, sll_addr)) + goto out_unlock; + addr = saddr->sll_addr; + } } err = -ENXIO; @@ -3344,20 +3346,29 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { + int copy_len; + /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); + copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); + copy_len = msg->msg_namelen; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { + memset(msg->msg_name + + offsetof(struct sockaddr_ll, sll_addr), + 0, sizeof(sll->sll_addr)); + msg->msg_namelen = sizeof(struct sockaddr_ll); + } } - memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, - msg->msg_namelen); + memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (pkt_sk(sk)->auxdata) { @@ -4077,11 +4088,6 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *)arg); - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *)arg); - #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: @@ -4457,6 +4463,7 @@ static const struct proto_ops packet_ops_spkt = { .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, @@ -4478,6 +4485,7 @@ static const struct proto_ops packet_ops = { .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, + .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, @@ -4590,14 +4598,29 @@ static void __exit packet_exit(void) static int __init packet_init(void) { - int rc = proto_register(&packet_proto, 0); + int 
rc; - if (rc != 0) + rc = proto_register(&packet_proto, 0); + if (rc) goto out; + rc = sock_register(&packet_family_ops); + if (rc) + goto out_proto; + rc = register_pernet_subsys(&packet_net_ops); + if (rc) + goto out_sock; + rc = register_netdevice_notifier(&packet_netdev_notifier); + if (rc) + goto out_pernet; + + return 0; - sock_register(&packet_family_ops); - register_pernet_subsys(&packet_net_ops); - register_netdevice_notifier(&packet_netdev_notifier); +out_pernet: + unregister_pernet_subsys(&packet_net_ops); +out_sock: + sock_unregister(PF_PACKET); +out_proto: + proto_unregister(&packet_proto); out: return rc; } diff --git a/net/packet/diag.c b/net/packet/diag.c index 7ef1c881ae74..98abfd8644a4 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -39,7 +39,7 @@ static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb) struct nlattr *mca; struct packet_mclist *ml; - mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST); + mca = nla_nest_start_noflag(nlskb, PACKET_DIAG_MCLIST); if (!mca) return -EMSGSIZE; diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 871eaf2cb85e..be92d936b5d5 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -79,8 +79,8 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_phonet_policy, extack); if (err < 0) return err; @@ -246,8 +246,8 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_phonet_policy, extack); if (err < 0) return err; diff --git a/net/psample/psample.c b/net/psample/psample.c index 64f95624f219..a107b2405668 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -100,6 +100,7 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg, static const struct genl_ops psample_nl_ops[] = { { .cmd = PSAMPLE_CMD_GET_GROUP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = psample_nl_cmd_get_group_dumpit, /* can be retrieved by unprivileged users */ } diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index b37e6e0a1026..801872a2e7aa 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -728,12 +728,13 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, struct sockaddr_qrtr *, struct sockaddr_qrtr *); + __le32 qrtr_type = cpu_to_le32(QRTR_TYPE_DATA); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; struct sk_buff *skb; + u32 type = 0; size_t plen; - u32 type = QRTR_TYPE_DATA; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) @@ -807,8 +808,8 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } /* control messages already require the type as 'command' */ - skb_copy_bits(skb, 0, &type, 4); - type = le32_to_cpu(type); + skb_copy_bits(skb, 0, &qrtr_type, 4); + type = le32_to_cpu(qrtr_type); } rc = enqueue_fn(node, skb, type, &ipc->us, addr); @@ -968,9 +969,6 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } break; - case SIOCGSTAMP: - rc = sock_get_timestamp(sk, argp); - break; case SIOCADDRT: case SIOCDELRT: case SIOCSIFADDR: @@ -1033,6 
+1031,7 @@ static const struct proto_ops qrtr_proto_ops = { .recvmsg = qrtr_recvmsg, .getname = qrtr_getname, .ioctl = qrtr_ioctl, + .gettstamp = sock_gettstamp, .poll = datagram_poll, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, @@ -1093,7 +1092,8 @@ static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy, extack); + rc = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, + qrtr_policy, extack); if (rc < 0) return rc; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index d6cc97fbbbb0..2b969f99ef13 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -543,6 +543,9 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return -EINVAL; + lock_sock(sk); switch (uaddr->sa_family) { diff --git a/net/rds/bind.c b/net/rds/bind.c index 17c9d9f0c848..0f4398e7f2a7 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -173,6 +173,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* We allow an RDS socket to be bound to either IPv4 or IPv6 * address. */ + if (addr_len < offsetofend(struct sockaddr, sa_family)) + return -EINVAL; if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 31cf37da4510..93c0437e6a5f 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -44,6 +44,17 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) else pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + /* Switch pools if one of the pools is reaching its upper limit */ + if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + pool = rds_ibdev->mr_1m_pool; + else + pool = rds_ibdev->mr_8k_pool; + } + ibmr = rds_ib_try_reuse_ibmr(pool); if (ibmr) return ibmr; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 63c8d107adcf..d664e9ade74d 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -454,9 +454,6 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) struct rds_ib_mr *ibmr = NULL; int iter = 0; - if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) - queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); - while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) return ibmr; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 70559854837e..8946c89d7392 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -772,7 +772,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, unsigned long frag_off; unsigned long to_copy; unsigned long copied; - uint64_t uncongested = 0; + __le64 uncongested = 0; void *addr; /* catch completely corrupt packets */ @@ -789,7 +789,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, copied = 0; while (copied < RDS_CONG_MAP_BYTES) { - uint64_t *src, *dst; + __le64 *src, *dst; unsigned int k; to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); @@ -824,9 +824,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, } /* the congestion map is in little endian order */ - uncongested = le64_to_cpu(uncongested); - - rds_cong_map_updated(map, uncongested); + rds_cong_map_updated(map, le64_to_cpu(uncongested)); } static void rds_ib_process_recv(struct rds_connection *conn, diff --git 
a/net/rds/info.c b/net/rds/info.c index e367a97a18c8..03f6fd56d237 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, 1, pages); + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 182ab8430594..b340ed4fc43a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, + pages); if (ret >= 0 && ret < nr_pages) { while (ret--) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index fd2694174607..66121bc6f34e 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -551,7 +551,7 @@ static __net_init int rds_tcp_init_net(struct net *net) tbl = kmemdup(rds_tcp_sysctl_table, sizeof(rds_tcp_sysctl_table), GFP_KERNEL); if (!tbl) { - pr_warn("could not set allocate syctl table\n"); + pr_warn("could not allocate sysctl table\n"); return -ENOMEM; } rtn->ctl_table = tbl; @@ -608,7 +608,7 @@ static void rds_tcp_kill_sock(struct net *net) list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); - if (net != c_net || !tc->t_sock) + if (net != c_net) continue; if (!list_has_conn(&tmp_list, tc->t_cpath->cp_conn)) { list_move_tail(&tc->t_tcp_node, &tmp_list); diff --git a/net/rfkill/core.c b/net/rfkill/core.c index abca57040f37..742e186bfadb 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1143,7 +1143,7 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) file->private_data = data; - return nonseekable_open(inode, file); + return stream_open(inode, file); free: mutex_unlock(&data->mtx); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index c96f63ffe31e..e274bc6e1458 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1301,12 +1301,6 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return put_user(amount, (unsigned int __user *) argp); } - case SIOCGSTAMP: - return sock_get_timestamp(sk, (struct timeval __user *) argp); - - case SIOCGSTAMPNS: - return sock_get_timestampns(sk, (struct timespec __user *) argp); - case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1474,6 +1468,7 @@ static const struct proto_ops rose_proto_ops = { .getname = rose_getname, .poll = datagram_poll, .ioctl = rose_ioctl, + .gettstamp = sock_gettstamp, .listen = rose_listen, .shutdown = sock_no_shutdown, .setsockopt = rose_setsockopt, diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 7af4f99c4a93..094a6621f8e8 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -16,6 +16,7 @@ #include <linux/init.h> static struct sk_buff_head loopback_queue; +#define ROSE_LOOPBACK_LIMIT 1000 static struct timer_list loopback_timer; static void rose_set_loopback_timer(void); @@ -35,29 +36,27 @@ static int rose_loopback_running(void) int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) { - struct sk_buff *skbn; + struct sk_buff *skbn = NULL; - skbn = skb_clone(skb, GFP_ATOMIC); + if (skb_queue_len(&loopback_queue) < ROSE_LOOPBACK_LIMIT) + skbn = skb_clone(skb, GFP_ATOMIC); - kfree_skb(skb); - - if (skbn != NULL) { + if (skbn) { + consume_skb(skb); 
skb_queue_tail(&loopback_queue, skbn); if (!rose_loopback_running()) rose_set_loopback_timer(); + } else { + kfree_skb(skb); } return 1; } - static void rose_set_loopback_timer(void) { - del_timer(&loopback_timer); - - loopback_timer.expires = jiffies + 10; - add_timer(&loopback_timer); + mod_timer(&loopback_timer, jiffies + 10); } static void rose_loopback_timer(struct timer_list *unused) @@ -68,8 +67,12 @@ static void rose_loopback_timer(struct timer_list *unused) struct sock *sk; unsigned short frametype; unsigned int lci_i, lci_o; + int count; - while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + for (count = 0; count < ROSE_LOOPBACK_LIMIT; count++) { + skb = skb_dequeue(&loopback_queue); + if (!skb) + return; if (skb->len < ROSE_MIN_LEN) { kfree_skb(skb); continue; @@ -106,6 +109,8 @@ static void rose_loopback_timer(struct timer_list *unused) kfree_skb(skb); } } + if (!skb_queue_empty(&loopback_queue)) + mod_timer(&loopback_timer, jiffies + 1); } void __exit rose_loopback_clear(void) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 96f2952bbdfd..ae8c5d7f3bf1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -135,7 +135,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - u16 service_id = srx->srx_service; + u16 service_id; int ret; _enter("%p,%p,%d", rx, saddr, len); @@ -143,6 +143,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) ret = rxrpc_validate_address(rx, srx, len); if (ret < 0) goto error; + service_id = srx->srx_service; lock_sock(&rx->sk); @@ -370,18 +371,22 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check + * @_life: Where to store the life value * * Allow a kernel service to find out whether a call is still alive - ie. we're - * getting ACKs from the server. Returns a number representing the life state - * which can be compared to that returned by a previous call. + * getting ACKs from the server. Passes back in *_life a number representing + * the life state which can be compared to that returned by a previous call and + * returns true if the call is still alive. * * If the life state stalls, rxrpc_kernel_probe_life() should be called and * then 2RTT waited. 
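The rose loopback rework above bounds both the queue depth (clone only below ROSE_LOOPBACK_LIMIT, otherwise drop) and the per-run drain budget, re-arming the timer when work remains. A toy model of that producer/drain pair (plain counters instead of sk_buffs):

#include <stdio.h>

#define LIMIT 1000

static int queued;

static int enqueue(void)
{
	if (queued >= LIMIT)
		return 0;	/* drop: the kfree_skb() path */
	queued++;		/* clone queued, original consumed */
	return 1;
}

static void drain(void)
{
	int budget;

	for (budget = 0; budget < LIMIT && queued; budget++) {
		queued--;
		if (budget % 2 == 0)
			enqueue();	/* traffic keeps arriving meanwhile */
	}
	if (queued)
		printf("re-arm timer, %d left\n", queued);
	else
		printf("queue drained\n");
}

int main(void)
{
	int i;

	for (i = 0; i < 1500; i++)
		enqueue();	/* 500 drops once the queue is full */
	printf("queued=%d\n", queued);
	drain();
	return 0;
}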
*/ -u32 rxrpc_kernel_check_life(const struct socket *sock, - const struct rxrpc_call *call) +bool rxrpc_kernel_check_life(const struct socket *sock, + const struct rxrpc_call *call, + u32 *_life) { - return call->acks_latest; + *_life = call->acks_latest; + return call->state != RXRPC_CALL_COMPLETE; } EXPORT_SYMBOL(rxrpc_kernel_check_life); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 4b1a534d290a..062ca9dc29b8 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -654,6 +654,7 @@ struct rxrpc_call { u8 ackr_reason; /* reason to ACK */ u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ + rxrpc_serial_t ackr_first_seq; /* first sequence number received */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8aa2937b069f..fe96881a334d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -604,30 +604,30 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) _enter(""); - if (list_empty(&rxnet->calls)) - return; + if (!list_empty(&rxnet->calls)) { + write_lock(&rxnet->call_lock); - write_lock(&rxnet->call_lock); + while (!list_empty(&rxnet->calls)) { + call = list_entry(rxnet->calls.next, + struct rxrpc_call, link); + _debug("Zapping call %p", call); - while (!list_empty(&rxnet->calls)) { - call = list_entry(rxnet->calls.next, struct rxrpc_call, link); - _debug("Zapping call %p", call); + rxrpc_see_call(call); + list_del_init(&call->link); - rxrpc_see_call(call); - list_del_init(&call->link); + pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->flags, call->events); - pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", - call, atomic_read(&call->usage), - rxrpc_call_states[call->state], - call->flags, call->events); + write_unlock(&rxnet->call_lock); + cond_resched(); + write_lock(&rxnet->call_lock); + } write_unlock(&rxnet->call_lock); - cond_resched(); - write_lock(&rxnet->call_lock); } - write_unlock(&rxnet->call_lock); - atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b6fca8ebb117..8d31fb4c51e1 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -153,7 +153,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, * pass a connection-level abort onto all calls on that connection */ static void rxrpc_abort_calls(struct rxrpc_connection *conn, - enum rxrpc_call_completion compl) + enum rxrpc_call_completion compl, + rxrpc_serial_t serial) { struct rxrpc_call *call; int i; @@ -173,6 +174,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, call->call_id, 0, conn->abort_code, conn->error); + else + trace_rxrpc_rx_abort(call, serial, + conn->abort_code); if (rxrpc_set_call_completion(call, compl, conn->abort_code, conn->error)) @@ -213,8 +217,6 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, conn->state = RXRPC_CONN_LOCALLY_ABORTED; spin_unlock_bh(&conn->state_lock); - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED); - msg.msg_name = &conn->params.peer->srx.transport; msg.msg_namelen = conn->params.peer->srx.transport_len; msg.msg_control = NULL; @@ -242,6 +244,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection 
*conn, len = iov[0].iov_len + iov[1].iov_len; serial = atomic_inc_return(&conn->serial); + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial); whdr.serial = htonl(serial); _proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code); @@ -321,7 +324,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, conn->error = -ECONNABORTED; conn->abort_code = abort_code; conn->state = RXRPC_CONN_REMOTELY_ABORTED; - rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9128aa0e40aa..c2c35cf4e308 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -837,7 +837,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, u8 acks[RXRPC_MAXACKS]; } buf; rxrpc_serial_t acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack; + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); @@ -851,13 +851,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, acked_serial = ntohl(buf.ack.serial); first_soft_ack = ntohl(buf.ack.firstPacket); + prev_pkt = ntohl(buf.ack.previousPacket); hard_ack = first_soft_ack - 1; nr_acks = buf.ack.nAcks; summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? buf.ack.reason : RXRPC_ACK__INVALID); trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial, - first_soft_ack, ntohl(buf.ack.previousPacket), + first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) @@ -878,8 +879,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_propose_ack_respond_to_ack); } - /* Discard any out-of-order or duplicate ACKs. */ - if (before_eq(sp->hdr.serial, call->acks_latest)) + /* Discard any out-of-order or duplicate ACKs (outside lock). */ + if (before(first_soft_ack, call->ackr_first_seq) || + before(prev_pkt, call->ackr_prev_seq)) return; buf.info.rxMTU = 0; @@ -890,12 +892,16 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, spin_lock(&call->input_lock); - /* Discard any out-of-order or duplicate ACKs. */ - if (before_eq(sp->hdr.serial, call->acks_latest)) + /* Discard any out-of-order or duplicate ACKs (inside lock). */ + if (before(first_soft_ack, call->ackr_first_seq) || + before(prev_pkt, call->ackr_prev_seq)) goto out; call->acks_latest_ts = skb->tstamp; call->acks_latest = sp->hdr.serial; + call->ackr_first_seq = first_soft_ack; + call->ackr_prev_seq = prev_pkt; + /* Parse rwind and mtu sizes if provided. */ if (buf.info.rxMTU) rxrpc_input_ackinfo(call, skb, &buf.info); @@ -1155,19 +1161,19 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * handle data received on the local endpoint * - may be called in interrupt context * - * The socket is locked by the caller and this prevents the socket from being - * shut down and the local endpoint from going away, thus sk_user_data will not - * be cleared until this function returns. + * [!] Note that as this is called from the encap_rcv hook, the socket is not + * held locked by the caller and nothing prevents sk_user_data on the UDP from + * being cleared in the middle of processing this function. * * Called with the RCU read lock held from the IP layer via UDP. 
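The new ACK-discard test above compares first_soft_ack and prev_pkt with before(), which is wrap-safe signed-difference arithmetic rather than a plain '<'. A self-contained sketch of the idiom (the same trick TCP's before() uses):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool seq_before(uint32_t a, uint32_t b)
{
	/* Correct across 32-bit wraparound: the difference is
	 * interpreted as a signed distance between the two points. */
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	/* 0xfffffff0 was issued before 0x10 despite being numerically larger */
	printf("%d\n", seq_before(0xfffffff0u, 0x10u));	/* 1 */
	printf("%d\n", seq_before(0x10u, 0xfffffff0u));	/* 0 */
	return 0;
}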
*/ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) { + struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); struct rxrpc_connection *conn; struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = udp_sk->sk_user_data; struct rxrpc_peer *peer = NULL; struct rxrpc_sock *rx = NULL; unsigned int channel; @@ -1175,6 +1181,10 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) _enter("%p", udp_sk); + if (unlikely(!local)) { + kfree_skb(skb); + return 0; + } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 15cf42d5b53a..b67dec945498 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -180,7 +180,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. */ - + /* Fall through */ case AF_INET: /* we want to receive ICMP errors */ opt = 1; @@ -304,7 +304,8 @@ nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); - kfree(local); + if (local) + call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index bc05af89fc38..6e84d878053c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -157,6 +157,11 @@ void rxrpc_error_report(struct sock *sk) _enter("%p{%d}", sk, local->debug_id); + /* Clear the outstanding error value on the socket so that it doesn't + * cause kernel_sendmsg() to return it later. + */ + sock_error(sk); + skb = sock_dequeue_err_skb(sk); if (!skb) { _leave("UDP socket errqueue empty"); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 46c9312085b1..bec64deb7b0a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -152,12 +152,13 @@ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, } /* - * Queue a DATA packet for transmission, set the resend timeout and send the - * packet immediately + * Queue a DATA packet for transmission, set the resend timeout and send + * the packet immediately. Returns the error from rxrpc_send_data_packet() + * in case the caller wants to do something with it. 
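The rxrpc_input_packet() change above reflects a general rule for UDP encap_rcv hooks: the socket is not locked, so sk_user_data can be cleared by teardown mid-receive and must be read under RCU and NULL-checked. A generic sketch of the pattern, with hypothetical my_* names:

static int my_encap_rcv(struct sock *sk, struct sk_buff *skb)
{
        /* RCU-safe fetch: teardown may clear sk_user_data concurrently */
        struct my_endpoint *ep = rcu_dereference_sk_user_data(sk);

        if (unlikely(!ep)) {
                kfree_skb(skb);
                return 0;       /* 0 tells UDP the skb was consumed */
        }
        /* ... process skb against ep under the RCU read lock ... */
        return 0;
}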
*/ -static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, - struct sk_buff *skb, bool last, - rxrpc_notify_end_tx_t notify_end_tx) +static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, + struct sk_buff *skb, bool last, + rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned long now; @@ -250,7 +251,8 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, out: rxrpc_free_skb(skb, rxrpc_skb_tx_freed); - _leave(""); + _leave(" = %d", ret); + return ret; } /* @@ -423,9 +425,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (ret < 0) goto out; - rxrpc_queue_packet(rx, call, skb, - !msg_data_left(msg) && !more, - notify_end_tx); + ret = rxrpc_queue_packet(rx, call, skb, + !msg_data_left(msg) && !more, + notify_end_tx); + /* Should check for failure here */ skb = NULL; } } while (msg_data_left(msg) > 0); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5a87e271d35a..683fcc00da49 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -242,7 +242,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, (unsigned long)p->tcfa_tm.lastuse)) continue; - nest = nla_nest_start(skb, n_i); + nest = nla_nest_start_noflag(skb, n_i); if (!nest) { index--; goto nla_put_failure; @@ -299,7 +299,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct tc_action *p; unsigned long id = 1; - nest = nla_nest_start(skb, 0); + nest = nla_nest_start_noflag(skb, 0); if (nest == NULL) goto nla_put_failure; if (nla_put_string(skb, TCA_KIND, ops->kind)) @@ -776,7 +776,7 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_old(skb, a, bind, ref); @@ -800,7 +800,7 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { a = actions[i]; - nest = nla_nest_start(skb, a->order); + nest = nla_nest_start_noflag(skb, a->order); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); @@ -849,7 +849,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, + extack); if (err < 0) goto err_out; err = -EINVAL; @@ -964,7 +965,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, int err; int i; - err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + extack); if (err < 0) return err; @@ -1052,7 +1054,7 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], t->tca__pad1 = 0; t->tca__pad2 = 0; - nest = nla_nest_start(skb, TCA_ACT_TAB); + nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) goto out_nlmsg_trim; @@ -1099,7 +1101,7 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; @@ -1153,7 +1155,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, b = skb_tail_pointer(skb); - err = nla_parse_nested(tb, 
TCA_ACT_MAX, nla, NULL, extack); + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; @@ -1176,7 +1178,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, t->tca__pad1 = 0; t->tca__pad2 = 0; - nest = nla_nest_start(skb, TCA_ACT_TAB); + nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) { NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; @@ -1282,7 +1284,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, size_t attr_size = 0; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); + ret = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + extack); if (ret < 0) return ret; @@ -1384,8 +1387,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, - extack); + ret = nlmsg_parse_deprecated(n, sizeof(struct tcamsg), tca, + TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; @@ -1436,13 +1439,12 @@ static struct nlattr *find_dump_kind(struct nlattr **nla) if (tb1 == NULL) return NULL; - if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), - NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) + if (nla_parse_deprecated(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) return NULL; if (tb[1] == NULL) return NULL; - if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) + if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; @@ -1466,8 +1468,8 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) u32 msecs_since = 0; u32 act_count = 0; - ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, - tcaa_policy, cb->extack); + ret = nlmsg_parse_deprecated(cb->nlh, sizeof(struct tcamsg), tb, + TCA_ROOT_MAX, tcaa_policy, cb->extack); if (ret < 0) return ret; @@ -1508,7 +1510,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (!count_attr) goto out_module_put; - nest = nla_nest_start(skb, TCA_ACT_TAB); + nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (nest == NULL) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 3841156aa09f..a0c77faca04b 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -293,7 +293,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL); + ret = nla_parse_nested_deprecated(tb, TCA_ACT_BPF_MAX, nla, + act_bpf_policy, NULL); if (ret < 0) return ret; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 32ae0cd6e31c..8838575cd536 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -111,8 +111,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy, - NULL); + ret = nla_parse_nested_deprecated(tb, TCA_CONNMARK_MAX, nla, + connmark_policy, NULL); if (ret < 0) return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 0c77e7bdf6d5..14bb525e355e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -61,7 +61,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy, NULL); + err = 
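The nla_parse_nested() -> nla_parse_nested_deprecated() and nla_nest_start() -> nla_nest_start_noflag() churn throughout net/sched is mechanical fallout of the strict netlink validation split: the *_deprecated parsers keep the old liberal behaviour for existing interfaces, while the unsuffixed helpers now validate strictly, and nla_nest_start() sets NLA_F_NESTED on the nest it opens. A short contrast sketch; MY_ATTR_NEST is a hypothetical attribute:

struct nlattr *nest;

/* New-style nest: NLA_F_NESTED is set, as strict parsers require. */
nest = nla_nest_start(skb, MY_ATTR_NEST);

/* Legacy TC wire format: the flag is deliberately left clear, paired
 * with nla_parse_nested_deprecated() on the receive side. */
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);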
nla_parse_nested_deprecated(tb, TCA_CSUM_MAX, nla, csum_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e540e31069d7..75492b07f324 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -74,7 +74,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_GACT_MAX, nla, gact_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 31c6ffb6abe7..12489f60a979 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -387,7 +387,7 @@ static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) if (list_empty(&ife->metalist)) return 0; - nest = nla_nest_start(skb, TCA_IFE_METALST); + nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; @@ -486,7 +486,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, int ret = 0; int err; - err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy, + NULL); if (err < 0) return err; @@ -567,8 +568,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, INIT_LIST_HEAD(&ife->metalist); if (tb[TCA_IFE_METALST]) { - err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], - NULL, NULL); + err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, + tb[TCA_IFE_METALST], NULL, + NULL); if (err) goto metadata_parse_err; err = populate_metalist(ife, tb2, exists, rtnl_held); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 04a0b5c61194..ae6e28ab1cd7 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -113,7 +113,8 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_IPT_MAX, nla, ipt_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 17cc6bd4c57c..c329390342f4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -111,7 +111,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; } - ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); + ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla, + mirred_policy, extack); if (ret < 0) return ret; if (!tb[TCA_MIRRED_PARMS]) { diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index e91bb8eb81ec..51bd1ba02380 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -52,7 +52,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_NAT_MAX, nla, nat_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 287793abfaf9..d790c02b9c6c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -70,8 +70,9 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, goto err_out; } - err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka, - pedit_key_ex_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, + ka, pedit_key_ex_policy, + NULL); if (err) goto err_out; @@ -108,14 
+109,15 @@ err_out: static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { - struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX); + struct nlattr *keys_start = nla_nest_start_noflag(skb, + TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; - key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX); + key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; @@ -157,7 +159,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return -EINVAL; } - err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, + pedit_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 2b8581f6ab51..61731944742a 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,42 +22,7 @@ #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> - -struct tcf_police_params { - int tcfp_result; - u32 tcfp_ewma_rate; - s64 tcfp_burst; - u32 tcfp_mtu; - s64 tcfp_mtu_ptoks; - struct psched_ratecfg rate; - bool rate_present; - struct psched_ratecfg peak; - bool peak_present; - struct rcu_head rcu; -}; - -struct tcf_police { - struct tc_action common; - struct tcf_police_params __rcu *params; - - spinlock_t tcfp_lock ____cacheline_aligned_in_smp; - s64 tcfp_toks; - s64 tcfp_ptoks; - s64 tcfp_t_c; -}; - -#define to_police(pc) ((struct tcf_police *)pc) - -/* old policer structure from before tc actions */ -struct tc_police_compat { - u32 index; - int action; - u32 limit; - u32 burst; - u32 mtu; - struct tc_ratespec rate; - struct tc_ratespec peakrate; -}; +#include <net/tc_act/tc_police.h> /* Each policer is serialized by its individual spinlock */ @@ -100,7 +65,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_POLICE_MAX, nla, + police_policy, NULL); if (err < 0) return err; @@ -316,6 +282,20 @@ static void tcf_police_cleanup(struct tc_action *a) kfree_rcu(p, rcu); } +static void tcf_police_stats_update(struct tc_action *a, + u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_police *police = to_police(a); + struct tcf_t *tm = &police->tcf_tm; + + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + if (hw) + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -379,6 +359,7 @@ static struct tc_action_ops act_police_ops = { .kind = "police", .id = TCA_ID_POLICE, .owner = THIS_MODULE, + .stats_update = tcf_police_stats_update, .act = tcf_police_act, .dump = tcf_police_dump, .init = tcf_police_init, diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 4060b0955c97..b2faa43c1ac7 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -45,15 +45,16 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; struct tcf_chain *goto_ch = NULL; + u32 psample_group_num, rate; struct tc_sample *parm; - u32 psample_group_num; struct tcf_sample *s; bool exists = false; int ret, err; if (!nla) return -EINVAL; - ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL); + 
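The act_police change above adds a .stats_update op so counters gathered by offload hardware can be folded back into the action's per-CPU byte/packet stats (with hw selecting the hardware buckets). A sketch of the consuming side, mirroring how a classifier pulls driver counters and fans them out to each action's op:

/* After querying the driver for accumulated stats, fold them into the
 * filter's actions; this dispatches to tcf_police_stats_update() for an
 * offloaded policer. */
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
                      cls_flower.stats.pkts, cls_flower.stats.lastused);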
ret = nla_parse_nested_deprecated(tb, TCA_SAMPLE_MAX, nla, + sample_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || @@ -85,6 +86,12 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (err < 0) goto release_idr; + rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); + if (!rate) { + NL_SET_ERR_MSG(extack, "invalid sample rate"); + err = -EINVAL; + goto put_chain; + } psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, psample_group_num); if (!psample_group) { @@ -96,7 +103,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, spin_lock_bh(&s->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); + s->rate = rate; s->psample_group_num = psample_group_num; RCU_INIT_POINTER(s->psample_group, psample_group); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 23c8ca5615e5..ead480e6014c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -104,7 +104,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7e1d261a31d2..7ec159b95364 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -114,7 +114,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_SKBEDIT_MAX, nla, + skbedit_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 1d4c324d0a42..186ef98c828f 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -102,7 +102,8 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_SKBMOD_MAX, nla, + skbmod_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d5aaf90a3971..6a9070511ee8 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -76,8 +76,9 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, int err, data_len, opt_len; u8 *data; - err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, - nla, geneve_opt_policy, extack); + err = nla_parse_nested_deprecated(tb, + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); if (err < 0) return err; @@ -125,8 +126,8 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, int err, rem, opt_len, len = nla_len(nla), opts_len = 0; const struct nlattr *attr, *head = nla_data(nla); - err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, - enc_opts_policy, extack); + err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); if (err) return err; @@ -235,8 +236,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return -EINVAL; } - err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, - extack); + err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_MAX, nla, + tunnel_key_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Failed to parse nested 
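Rejecting a zero TCA_SAMPLE_RATE above closes a divide-by-zero: the sample action's fast path decides whether to mirror a packet to the psample channel with a modulo test against the rate, which is undefined for rate == 0. A rough sketch of that datapath decision (deliver_sample() is a hypothetical stand-in):

/* Sample roughly one packet in 'rate'; modulo by zero would oops,
 * hence the config-time -EINVAL with an extack message. */
if (prandom_u32() % rate == 0)
        deliver_sample(skb);    /* hypothetical helper */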
tunnel key attributes"); return err; @@ -426,7 +427,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, u8 *src = (u8 *)(info + 1); struct nlattr *start; - start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); if (!start) return -EMSGSIZE; @@ -460,7 +461,7 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, if (!info->options_len) return 0; - start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS); + start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS); if (!start) return -EMSGSIZE; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 0f40d0a74423..39bd9fa3e455 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -124,7 +124,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_VLAN_MAX, nla, vlan_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 99ae30c177c7..d4699156974a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -37,6 +37,8 @@ #include <net/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_csum.h> #include <net/tc_act/tc_gact.h> +#include <net/tc_act/tc_police.h> +#include <net/tc_act/tc_sample.h> #include <net/tc_act/tc_skbedit.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -2006,7 +2008,8 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, replay: tp_created = 0; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); + err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; @@ -2217,7 +2220,8 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); + err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; @@ -2366,7 +2370,8 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, int err; bool rtnl_held = false; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); + err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; @@ -2558,8 +2563,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, - cb->extack); + err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, + NULL, cb->extack); if (err) return err; @@ -2806,7 +2811,8 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, return -EPERM; replay: - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); + err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; @@ -2937,8 +2943,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, - cb->extack); + err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, cb->extack); if (err) return err; @@ -3111,7 +3117,7 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) * tc data even if iproute2 was newer - jhs */ if (exts->type != 
TCA_OLD_COMPAT) { - nest = nla_nest_start(skb, exts->action); + nest = nla_nest_start_noflag(skb, exts->action); if (nest == NULL) goto nla_put_failure; @@ -3120,7 +3126,7 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) nla_nest_end(skb, nest); } else if (exts->police) { struct tc_action *act = tcf_exts_first_act(exts); - nest = nla_nest_start(skb, exts->police); + nest = nla_nest_start_noflag(skb, exts->police); if (nest == NULL || !act) goto nla_put_failure; if (tcf_action_dump_old(skb, act, 0, 0) < 0) @@ -3229,7 +3235,6 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->tunnel = tcf_tunnel_info(act); } else if (is_tcf_tunnel_release(act)) { entry->id = FLOW_ACTION_TUNNEL_DECAP; - entry->tunnel = tcf_tunnel_info(act); } else if (is_tcf_pedit(act)) { for (k = 0; k < tcf_pedit_nkeys(act); k++) { switch (tcf_pedit_cmd(act, k)) { @@ -3254,6 +3259,18 @@ int tc_setup_flow_action(struct flow_action *flow_action, } else if (is_tcf_skbedit_mark(act)) { entry->id = FLOW_ACTION_MARK; entry->mark = tcf_skbedit_mark(act); + } else if (is_tcf_sample(act)) { + entry->id = FLOW_ACTION_SAMPLE; + entry->sample.psample_group = + tcf_sample_psample_group(act); + entry->sample.trunc_size = tcf_sample_trunc_size(act); + entry->sample.truncate = tcf_sample_truncate(act); + entry->sample.rate = tcf_sample_rate(act); + } else if (is_tcf_police(act)) { + entry->id = FLOW_ACTION_POLICE; + entry->police.burst = tcf_police_tcfp_burst(act); + entry->police.rate_bytes_ps = + tcf_police_rate_bytes_ps(act); } else { goto err_out; } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 687b0af67878..923863f3b0d8 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -185,8 +185,8 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (tca[TCA_OPTIONS] == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], - basic_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], + basic_policy, NULL); if (err < 0) return err; @@ -288,7 +288,7 @@ static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, t->tcm_handle = f->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index b4ac58039cb1..27365ed3fe0b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -157,8 +157,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, skip_sw = prog && tc_skip_sw(prog->gen_flags); obj = prog ?: oldprog; - tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, - extack); + tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, extack); cls_bpf.command = TC_CLSBPF_OFFLOAD; cls_bpf.exts = &obj->exts; cls_bpf.prog = prog ? 
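With tc_setup_flow_action() now translating sample and police into the driver-independent flow_action array, an offloading driver sees them as ordinary entries. A sketch of the consumer side under that assumption; flow_action_for_each() is the real iterator, the my_hw_*() helpers are hypothetical:

const struct flow_action_entry *act;
int i;

flow_action_for_each(i, act, &rule->action) {
        switch (act->id) {
        case FLOW_ACTION_POLICE:
                my_hw_add_policer(act->police.rate_bytes_ps,
                                  act->police.burst);
                break;
        case FLOW_ACTION_SAMPLE:
                my_hw_add_sampler(act->sample.psample_group,
                                  act->sample.rate,
                                  act->sample.trunc_size);
                break;
        default:
                /* unsupported action: the driver should refuse the
                 * offload rather than silently ignore the entry */
                break;
        }
}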
prog->filter : NULL; @@ -468,8 +467,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (tca[TCA_OPTIONS] == NULL) return -EINVAL; - ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy, - NULL); + ret = nla_parse_nested_deprecated(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], + bpf_policy, NULL); if (ret < 0) return ret; @@ -591,7 +590,7 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh, cls_bpf_offload_update_stats(tp, prog); - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 4c1567854f95..35659127e5a3 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -32,6 +32,8 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); u32 classid = task_get_classid(skb); + if (unlikely(!head)) + return -1; if (!classid) return -1; if (!tcf_em_tree_match(skb, &head->ematches, NULL)) @@ -104,8 +106,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, goto errout; new->handle = handle; new->tp = tp; - err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], - cgroup_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_CGROUP_MAX, + tca[TCA_OPTIONS], cgroup_policy, + NULL); if (err < 0) goto errout; @@ -176,7 +179,7 @@ static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh, t->tcm_handle = head->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index eece1ee26930..7bb79ec5b176 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -408,7 +408,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_FLOW_MAX, opt, flow_policy, + NULL); if (err < 0) return err; @@ -629,7 +630,7 @@ static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh, t->tcm_handle = f->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c04247b403ed..f6685fc53119 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/rhashtable.h> #include <linux/workqueue.h> +#include <linux/refcount.h> #include <linux/if_ether.h> #include <linux/in6.h> @@ -75,6 +76,7 @@ struct fl_flow_mask { struct list_head filters; struct rcu_work rwork; struct list_head list; + refcount_t refcnt; }; struct fl_flow_tmplt { @@ -86,7 +88,9 @@ struct fl_flow_tmplt { struct cls_fl_head { struct rhashtable ht; + spinlock_t masks_lock; /* Protect masks list */ struct list_head masks; + struct list_head hw_filters; struct rcu_work rwork; struct idr handle_idr; }; @@ -99,11 +103,18 @@ struct cls_fl_filter { struct tcf_result res; struct fl_flow_key key; struct list_head list; + struct list_head hw_list; u32 handle; u32 flags; u32 in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; + /* Flower classifier is unlocked, which means that its reference counter + * can be changed concurrently without any kind of external + * synchronization. 
Use atomic reference counter to be concurrency-safe. + */ + refcount_t refcnt; + bool deleted; }; static const struct rhashtable_params mask_ht_params = { @@ -304,7 +315,9 @@ static int fl_init(struct tcf_proto *tp) if (!head) return -ENOBUFS; + spin_lock_init(&head->masks_lock); INIT_LIST_HEAD_RCU(&head->masks); + INIT_LIST_HEAD(&head->hw_filters); rcu_assign_pointer(tp->root, head); idr_init(&head->handle_idr); @@ -313,6 +326,7 @@ static int fl_init(struct tcf_proto *tp) static void fl_mask_free(struct fl_flow_mask *mask) { + WARN_ON(!list_empty(&mask->filters)); rhashtable_destroy(&mask->ht); kfree(mask); } @@ -325,22 +339,32 @@ static void fl_mask_free_work(struct work_struct *work) fl_mask_free(mask); } -static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, - bool async) +static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask) { - if (!list_empty(&mask->filters)) + if (!refcount_dec_and_test(&mask->refcnt)) return false; rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params); + + spin_lock(&head->masks_lock); list_del_rcu(&mask->list); - if (async) - tcf_queue_work(&mask->rwork, fl_mask_free_work); - else - fl_mask_free(mask); + spin_unlock(&head->masks_lock); + + tcf_queue_work(&mask->rwork, fl_mask_free_work); return true; } +static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp) +{ + /* Flower classifier only changes root pointer during init and destroy. + * Users must obtain reference to tcf_proto instance before calling its + * API, so tp->root pointer is protected from concurrent call to + * fl_destroy() by reference counting. + */ + return rcu_dereference_raw(tp->root); +} + static void __fl_destroy_filter(struct cls_fl_filter *f) { tcf_exts_destroy(&f->exts); @@ -353,37 +377,50 @@ static void fl_destroy_filter_work(struct work_struct *work) struct cls_fl_filter *f = container_of(to_rcu_work(work), struct cls_fl_filter, rwork); - rtnl_lock(); __fl_destroy_filter(f); - rtnl_unlock(); } static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; + if (!rtnl_held) + rtnl_lock(); + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false); + spin_lock(&tp->lock); + list_del_init(&f->hw_list); tcf_block_offload_dec(block, &f->flags); + spin_unlock(&tp->lock); + + if (!rtnl_held) + rtnl_unlock(); } static int fl_hw_replace_filter(struct tcf_proto *tp, - struct cls_fl_filter *f, + struct cls_fl_filter *f, bool rtnl_held, struct netlink_ext_ack *extack) { + struct cls_fl_head *head = fl_head_dereference(tp); struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; bool skip_sw = tc_skip_sw(f->flags); - int err; + int err = 0; + + if (!rtnl_held) + rtnl_lock(); cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts)); - if (!cls_flower.rule) - return -ENOMEM; + if (!cls_flower.rule) { + err = -ENOMEM; + goto errout; + } tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = TC_CLSFLOWER_REPLACE; @@ -396,35 +433,51 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); if (err) { kfree(cls_flower.rule); - if (skip_sw) { + if 
(skip_sw) NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); - return err; - } - return 0; + else + err = 0; + goto errout; } err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw); kfree(cls_flower.rule); if (err < 0) { - fl_hw_destroy_filter(tp, f, NULL); - return err; + fl_hw_destroy_filter(tp, f, true, NULL); + goto errout; } else if (err > 0) { f->in_hw_count = err; + err = 0; + spin_lock(&tp->lock); tcf_block_offload_inc(block, &f->flags); + spin_unlock(&tp->lock); } - if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) - return -EINVAL; + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) { + err = -EINVAL; + goto errout; + } - return 0; + spin_lock(&tp->lock); + list_add(&f->hw_list, &head->hw_filters); + spin_unlock(&tp->lock); +errout: + if (!rtnl_held) + rtnl_unlock(); + + return err; } -static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) +static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, + bool rtnl_held) { struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; + if (!rtnl_held) + rtnl_lock(); + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; @@ -435,27 +488,81 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, cls_flower.stats.pkts, cls_flower.stats.lastused); + + if (!rtnl_held) + rtnl_unlock(); } -static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, - struct netlink_ext_ack *extack) +static void __fl_put(struct cls_fl_filter *f) { - struct cls_fl_head *head = rtnl_dereference(tp->root); - bool async = tcf_exts_get_net(&f->exts); - bool last; + if (!refcount_dec_and_test(&f->refcnt)) + return; + + if (tcf_exts_get_net(&f->exts)) + tcf_queue_work(&f->rwork, fl_destroy_filter_work); + else + __fl_destroy_filter(f); +} + +static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle) +{ + struct cls_fl_filter *f; + rcu_read_lock(); + f = idr_find(&head->handle_idr, handle); + if (f && !refcount_inc_not_zero(&f->refcnt)) + f = NULL; + rcu_read_unlock(); + + return f; +} + +static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp, + unsigned long *handle) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + struct cls_fl_filter *f; + + rcu_read_lock(); + while ((f = idr_get_next_ul(&head->handle_idr, handle))) { + /* don't return filters that are being deleted */ + if (refcount_inc_not_zero(&f->refcnt)) + break; + ++(*handle); + } + rcu_read_unlock(); + + return f; +} + +static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, + bool *last, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + + *last = false; + + spin_lock(&tp->lock); + if (f->deleted) { + spin_unlock(&tp->lock); + return -ENOENT; + } + + f->deleted = true; + rhashtable_remove_fast(&f->mask->ht, &f->ht_node, + f->mask->filter_ht_params); idr_remove(&head->handle_idr, f->handle); list_del_rcu(&f->list); - last = fl_mask_put(head, f->mask, async); + spin_unlock(&tp->lock); + + *last = fl_mask_put(head, f->mask); if (!tc_skip_hw(f->flags)) - fl_hw_destroy_filter(tp, f, extack); + fl_hw_destroy_filter(tp, f, rtnl_held, extack); tcf_unbind_filter(tp, &f->res); - if (async) - tcf_queue_work(&f->rwork, fl_destroy_filter_work); - else - __fl_destroy_filter(f); + __fl_put(f); - return last; + return 0; } static void 
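The unlocked flower conversion above leans on a standard RCU-plus-refcount lifetime scheme: a filter found in the IDR is only handed out if its refcount can be raised from a non-zero value, so a racing final put can never be resurrected. A condensed generic sketch of the __fl_get()/__fl_put() pairing, with hypothetical my_* names:

static struct my_filter *my_get(struct my_head *head, u32 handle)
{
        struct my_filter *f;

        rcu_read_lock();
        f = idr_find(&head->handle_idr, handle);
        if (f && !refcount_inc_not_zero(&f->refcnt))
                f = NULL;               /* lost the race with deletion */
        rcu_read_unlock();
        return f;
}

static void my_put(struct my_filter *f)
{
        if (refcount_dec_and_test(&f->refcnt))
                my_free_deferred(f);    /* hypothetical RCU-deferred free */
}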
fl_destroy_sleepable(struct work_struct *work) @@ -472,13 +579,15 @@ static void fl_destroy_sleepable(struct work_struct *work) static void fl_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { - struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_head *head = fl_head_dereference(tp); struct fl_flow_mask *mask, *next_mask; struct cls_fl_filter *f, *next; + bool last; list_for_each_entry_safe(mask, next_mask, &head->masks, list) { list_for_each_entry_safe(f, next, &mask->filters, list) { - if (__fl_delete(tp, f, extack)) + __fl_delete(tp, f, &last, rtnl_held, extack); + if (last) break; } } @@ -488,11 +597,18 @@ static void fl_destroy(struct tcf_proto *tp, bool rtnl_held, tcf_queue_work(&head->rwork, fl_destroy_sleepable); } +static void fl_put(struct tcf_proto *tp, void *arg) +{ + struct cls_fl_filter *f = arg; + + __fl_put(f); +} + static void *fl_get(struct tcf_proto *tp, u32 handle) { - struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_head *head = fl_head_dereference(tp); - return idr_find(&head->handle_idr, handle); + return __fl_get(head, handle); } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { @@ -768,8 +884,9 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, return -EINVAL; } - err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, - nla, geneve_opt_policy, extack); + err = nla_parse_nested_deprecated(tb, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); if (err < 0) return err; @@ -831,18 +948,18 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; int err, option_len, key_depth, msk_depth = 0; - err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS], - TCA_FLOWER_KEY_ENC_OPTS_MAX, - enc_opts_policy, extack); + err = nla_validate_nested_deprecated(tb[TCA_FLOWER_KEY_ENC_OPTS], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); if (err) return err; nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { - err = nla_validate_nested(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK], - TCA_FLOWER_KEY_ENC_OPTS_MAX, - enc_opts_policy, extack); + err = nla_validate_nested_deprecated(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK], + TCA_FLOWER_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); if (err) return err; @@ -1227,12 +1344,18 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, INIT_LIST_HEAD_RCU(&newmask->filters); - err = rhashtable_insert_fast(&head->ht, &newmask->ht_node, - mask_ht_params); + refcount_set(&newmask->refcnt, 1); + err = rhashtable_replace_fast(&head->ht, &mask->ht_node, + &newmask->ht_node, mask_ht_params); if (err) goto errout_destroy; + /* Wait until any potential concurrent users of mask are finished */ + synchronize_rcu(); + + spin_lock(&head->masks_lock); list_add_tail_rcu(&newmask->list, &head->masks); + spin_unlock(&head->masks_lock); return newmask; @@ -1250,41 +1373,77 @@ static int fl_check_assign_mask(struct cls_fl_head *head, struct fl_flow_mask *mask) { struct fl_flow_mask *newmask; + int ret = 0; + + rcu_read_lock(); - fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params); + /* Insert mask as temporary node to prevent concurrent creation of mask + * with same key. Any concurrent lookups with same key will return + * -EAGAIN because mask's refcnt is zero. 
It is safe to insert + * stack-allocated 'mask' to masks hash table because we call + * synchronize_rcu() before returning from this function (either in case + * of error or after replacing it with heap-allocated mask in + * fl_create_new_mask()). + */ + fnew->mask = rhashtable_lookup_get_insert_fast(&head->ht, + &mask->ht_node, + mask_ht_params); if (!fnew->mask) { - if (fold) - return -EINVAL; + rcu_read_unlock(); + + if (fold) { + ret = -EINVAL; + goto errout_cleanup; + } newmask = fl_create_new_mask(head, mask); - if (IS_ERR(newmask)) - return PTR_ERR(newmask); + if (IS_ERR(newmask)) { + ret = PTR_ERR(newmask); + goto errout_cleanup; + } fnew->mask = newmask; + return 0; + } else if (IS_ERR(fnew->mask)) { + ret = PTR_ERR(fnew->mask); } else if (fold && fold->mask != fnew->mask) { - return -EINVAL; + ret = -EINVAL; + } else if (!refcount_inc_not_zero(&fnew->mask->refcnt)) { + /* Mask was deleted concurrently, try again */ + ret = -EAGAIN; } + rcu_read_unlock(); + return ret; - return 0; +errout_cleanup: + rhashtable_remove_fast(&head->ht, &mask->ht_node, + mask_ht_params); + /* Wait until any potential concurrent users of mask are finished */ + synchronize_rcu(); + return ret; } static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr, - struct fl_flow_tmplt *tmplt, + struct fl_flow_tmplt *tmplt, bool rtnl_held, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, + err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held, extack); if (err < 0) return err; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + if (!rtnl_held) + rtnl_lock(); tcf_bind_filter(tp, &f->res, base); + if (!rtnl_held) + rtnl_unlock(); } err = fl_set_key(net, tb, &f->key, &mask->key, extack); @@ -1302,25 +1461,52 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, return 0; } +static int fl_ht_insert_unique(struct cls_fl_filter *fnew, + struct cls_fl_filter *fold, + bool *in_ht) +{ + struct fl_flow_mask *mask = fnew->mask; + int err; + + err = rhashtable_lookup_insert_fast(&mask->ht, + &fnew->ht_node, + mask->filter_ht_params); + if (err) { + *in_ht = false; + /* It is okay if filter with same key exists when + * overwriting. + */ + return fold && err == -EEXIST ? 
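fl_check_assign_mask() above hinges on rhashtable_lookup_get_insert_fast(), which has three distinct outcomes the new code must separate. A sketch of the shape, under the same stack-allocated-temporary assumption the comment describes:

struct my_mask *old;

old = rhashtable_lookup_get_insert_fast(&head->ht, &new->ht_node,
                                        mask_ht_params);
if (!old) {
        /* 'new' was inserted; if it lives on the stack, its later
         * removal must be followed by synchronize_rcu() before the
         * stack frame can be reused */
} else if (IS_ERR(old)) {
        err = PTR_ERR(old);             /* table error, e.g. -ENOMEM */
} else {
        /* same key already present: take a reference on 'old', and
         * retry with -EAGAIN if its refcount already dropped to zero */
}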
0 : err; + } + + *in_ht = true; + return 0; +} + static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, bool ovr, bool rtnl_held, struct netlink_ext_ack *extack) { - struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_head *head = fl_head_dereference(tp); struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; struct fl_flow_mask *mask; struct nlattr **tb; + bool in_ht; int err; - if (!tca[TCA_OPTIONS]) - return -EINVAL; + if (!tca[TCA_OPTIONS]) { + err = -EINVAL; + goto errout_fold; + } mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL); - if (!mask) - return -ENOBUFS; + if (!mask) { + err = -ENOBUFS; + goto errout_fold; + } tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!tb) { @@ -1328,8 +1514,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout_mask_alloc; } - err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], - fl_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX, + tca[TCA_OPTIONS], fl_policy, NULL); if (err < 0) goto errout_tb; @@ -1343,6 +1529,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, err = -ENOBUFS; goto errout_tb; } + INIT_LIST_HEAD(&fnew->hw_list); + refcount_set(&fnew->refcnt, 1); err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0); if (err < 0) @@ -1358,7 +1546,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr, - tp->chain->tmplt_priv, extack); + tp->chain->tmplt_priv, rtnl_held, extack); if (err) goto errout; @@ -1366,169 +1554,247 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; - if (!handle) { - handle = 1; - err = idr_alloc_u32(&head->handle_idr, fnew, &handle, - INT_MAX, GFP_KERNEL); - } else if (!fold) { - /* user specifies a handle and it doesn't exist */ - err = idr_alloc_u32(&head->handle_idr, fnew, &handle, - handle, GFP_KERNEL); - } + err = fl_ht_insert_unique(fnew, fold, &in_ht); if (err) goto errout_mask; - fnew->handle = handle; - - if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) { - err = -EEXIST; - goto errout_idr; - } - - err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, - fnew->mask->filter_ht_params); - if (err) - goto errout_idr; if (!tc_skip_hw(fnew->flags)) { - err = fl_hw_replace_filter(tp, fnew, extack); + err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack); if (err) - goto errout_mask_ht; + goto errout_ht; } if (!tc_in_hw(fnew->flags)) fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + spin_lock(&tp->lock); + + /* tp was deleted concurrently. -EAGAIN will cause caller to lookup + * proto again or create new one, if necessary. + */ + if (tp->deleting) { + err = -EAGAIN; + goto errout_hw; + } + if (fold) { + /* Fold filter was deleted concurrently. Retry lookup. 
*/ + if (fold->deleted) { + err = -EAGAIN; + goto errout_hw; + } + + fnew->handle = handle; + + if (!in_ht) { + struct rhashtable_params params = + fnew->mask->filter_ht_params; + + err = rhashtable_insert_fast(&fnew->mask->ht, + &fnew->ht_node, + params); + if (err) + goto errout_hw; + in_ht = true; + } + + refcount_inc(&fnew->refcnt); rhashtable_remove_fast(&fold->mask->ht, &fold->ht_node, fold->mask->filter_ht_params); - if (!tc_skip_hw(fold->flags)) - fl_hw_destroy_filter(tp, fold, NULL); - } - - *arg = fnew; - - if (fold) { idr_replace(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->list, &fnew->list); + fold->deleted = true; + + spin_unlock(&tp->lock); + + fl_mask_put(head, fold->mask); + if (!tc_skip_hw(fold->flags)) + fl_hw_destroy_filter(tp, fold, rtnl_held, NULL); tcf_unbind_filter(tp, &fold->res); - tcf_exts_get_net(&fold->exts); - tcf_queue_work(&fold->rwork, fl_destroy_filter_work); + /* Caller holds reference to fold, so refcnt is always > 0 + * after this. + */ + refcount_dec(&fold->refcnt); + __fl_put(fold); } else { + if (handle) { + /* user specifies a handle and it doesn't exist */ + err = idr_alloc_u32(&head->handle_idr, fnew, &handle, + handle, GFP_ATOMIC); + + /* Filter with specified handle was concurrently + * inserted after initial check in cls_api. This is not + * necessarily an error if NLM_F_EXCL is not set in + * message flags. Returning EAGAIN will cause cls_api to + * try to update concurrently inserted rule. + */ + if (err == -ENOSPC) + err = -EAGAIN; + } else { + handle = 1; + err = idr_alloc_u32(&head->handle_idr, fnew, &handle, + INT_MAX, GFP_ATOMIC); + } + if (err) + goto errout_hw; + + refcount_inc(&fnew->refcnt); + fnew->handle = handle; list_add_tail_rcu(&fnew->list, &fnew->mask->filters); + spin_unlock(&tp->lock); } + *arg = fnew; + kfree(tb); kfree(mask); return 0; -errout_mask_ht: - rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node, - fnew->mask->filter_ht_params); - -errout_idr: - if (!fold) - idr_remove(&head->handle_idr, fnew->handle); - +errout_ht: + spin_lock(&tp->lock); +errout_hw: + fnew->deleted = true; + spin_unlock(&tp->lock); + if (!tc_skip_hw(fnew->flags)) + fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL); + if (in_ht) + rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); errout_mask: - fl_mask_put(head, fnew->mask, false); - + fl_mask_put(head, fnew->mask); errout: - tcf_exts_destroy(&fnew->exts); - kfree(fnew); + __fl_put(fnew); errout_tb: kfree(tb); errout_mask_alloc: kfree(mask); +errout_fold: + if (fold) + __fl_put(fold); return err; } static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { - struct cls_fl_head *head = rtnl_dereference(tp->root); + struct cls_fl_head *head = fl_head_dereference(tp); struct cls_fl_filter *f = arg; + bool last_on_mask; + int err = 0; - rhashtable_remove_fast(&f->mask->ht, &f->ht_node, - f->mask->filter_ht_params); - __fl_delete(tp, f, extack); + err = __fl_delete(tp, f, &last_on_mask, rtnl_held, extack); *last = list_empty(&head->masks); - return 0; + __fl_put(f); + + return err; } static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { - struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; arg->count = arg->skip; - while ((f = idr_get_next_ul(&head->handle_idr, - &arg->cookie)) != NULL) { + while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) { if (arg->fn(tp, f, arg) < 0) { + __fl_put(f); arg->stop = 1; break; } - 
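The fl_change() rework above is an optimistic-concurrency commit: everything expensive (hardware offload, hash insertion) is prepared outside tp->lock, then re-validated and published inside it, with -EAGAIN unwinding the preparation when a concurrent delete won the race so the caller can restart the lookup. A condensed sketch of the commit phase; the unwind label is illustrative:

spin_lock(&tp->lock);
if (tp->deleting || (fold && fold->deleted)) {
        spin_unlock(&tp->lock);
        err = -EAGAIN;          /* caller replays the request */
        goto errout_hw;         /* undo hw/ht state set up earlier */
}
/* ...publish fnew: idr_replace()/idr_alloc_u32(), list updates... */
spin_unlock(&tp->lock);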
arg->cookie = f->handle + 1; + __fl_put(f); + arg->cookie++; arg->count++; } } +static struct cls_fl_filter * +fl_get_next_hw_filter(struct tcf_proto *tp, struct cls_fl_filter *f, bool add) +{ + struct cls_fl_head *head = fl_head_dereference(tp); + + spin_lock(&tp->lock); + if (list_empty(&head->hw_filters)) { + spin_unlock(&tp->lock); + return NULL; + } + + if (!f) + f = list_entry(&head->hw_filters, struct cls_fl_filter, + hw_list); + list_for_each_entry_continue(f, &head->hw_filters, hw_list) { + if (!(add && f->deleted) && refcount_inc_not_zero(&f->refcnt)) { + spin_unlock(&tp->lock); + return f; + } + } + + spin_unlock(&tp->lock); + return NULL; +} + static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { - struct cls_fl_head *head = rtnl_dereference(tp->root); struct tc_cls_flower_offload cls_flower = {}; struct tcf_block *block = tp->chain->block; - struct fl_flow_mask *mask; - struct cls_fl_filter *f; + struct cls_fl_filter *f = NULL; int err; - list_for_each_entry(mask, &head->masks, list) { - list_for_each_entry(f, &mask->filters, list) { - if (tc_skip_hw(f->flags)) - continue; - - cls_flower.rule = - flow_rule_alloc(tcf_exts_num_actions(&f->exts)); - if (!cls_flower.rule) - return -ENOMEM; - - tc_cls_common_offload_init(&cls_flower.common, tp, - f->flags, extack); - cls_flower.command = add ? - TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; - cls_flower.cookie = (unsigned long)f; - cls_flower.rule->match.dissector = &mask->dissector; - cls_flower.rule->match.mask = &mask->key; - cls_flower.rule->match.key = &f->mkey; - - err = tc_setup_flow_action(&cls_flower.rule->action, - &f->exts); - if (err) { - kfree(cls_flower.rule); - if (tc_skip_sw(f->flags)) { - NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); - return err; - } - continue; + /* hw_filters list can only be changed by hw offload functions after + * obtaining rtnl lock. Make sure it is not changed while reoffload is + * iterating it. + */ + ASSERT_RTNL(); + + while ((f = fl_get_next_hw_filter(tp, f, add))) { + cls_flower.rule = + flow_rule_alloc(tcf_exts_num_actions(&f->exts)); + if (!cls_flower.rule) { + __fl_put(f); + return -ENOMEM; + } + + tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, + extack); + cls_flower.command = add ? 
+ TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long)f; + cls_flower.rule->match.dissector = &f->mask->dissector; + cls_flower.rule->match.mask = &f->mask->key; + cls_flower.rule->match.key = &f->mkey; + + err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts); + if (err) { + kfree(cls_flower.rule); + if (tc_skip_sw(f->flags)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); + __fl_put(f); + return err; } + goto next_flow; + } - cls_flower.classid = f->res.classid; + cls_flower.classid = f->res.classid; - err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); - kfree(cls_flower.rule); + err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + kfree(cls_flower.rule); - if (err) { - if (add && tc_skip_sw(f->flags)) - return err; - continue; + if (err) { + if (add && tc_skip_sw(f->flags)) { + __fl_put(f); + return err; } - - tc_cls_offload_cnt_update(block, &f->in_hw_count, - &f->flags, add); + goto next_flow; } + + spin_lock(&tp->lock); + tc_cls_offload_cnt_update(block, &f->in_hw_count, &f->flags, + add); + spin_unlock(&tp->lock); +next_flow: + __fl_put(f); } return 0; @@ -1587,8 +1853,8 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!tb) return ERR_PTR(-ENOBUFS); - err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], - fl_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_FLOWER_MAX, + tca[TCA_OPTIONS], fl_policy, NULL); if (err) goto errout_tb; @@ -1786,7 +2052,7 @@ static int fl_dump_key_geneve_opt(struct sk_buff *skb, struct nlattr *nest; int opt_off = 0; - nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); if (!nest) goto nla_put_failure; @@ -1822,7 +2088,7 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, if (!enc_opts->len) return 0; - nest = nla_nest_start(skb, enc_opt_type); + nest = nla_nest_start_noflag(skb, enc_opt_type); if (!nest) goto nla_put_failure; @@ -2061,31 +2327,37 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, struct cls_fl_filter *f = fh; struct nlattr *nest; struct fl_flow_key *key, *mask; + bool skip_hw; if (!f) return skb->len; t->tcm_handle = f->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; + spin_lock(&tp->lock); + if (f->res.classid && nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) - goto nla_put_failure; + goto nla_put_failure_locked; key = &f->key; mask = &f->mask->key; + skip_hw = tc_skip_hw(f->flags); if (fl_dump_key(skb, net, key, mask)) - goto nla_put_failure; - - if (!tc_skip_hw(f->flags)) - fl_hw_update_stats(tp, f); + goto nla_put_failure_locked; if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) - goto nla_put_failure; + goto nla_put_failure_locked; + + spin_unlock(&tp->lock); + + if (!skip_hw) + fl_hw_update_stats(tp, f, rtnl_held); if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) goto nla_put_failure; @@ -2100,6 +2372,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, return skb->len; +nla_put_failure_locked: + spin_unlock(&tp->lock); nla_put_failure: nla_nest_cancel(skb, nest); return -1; @@ -2111,7 +2385,7 @@ static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) struct fl_flow_key *key, *mask; struct nlattr *nest; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, 
TCA_OPTIONS); if (!nest) goto nla_put_failure; @@ -2144,6 +2418,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .init = fl_init, .destroy = fl_destroy, .get = fl_get, + .put = fl_put, .change = fl_change, .delete = fl_delete, .walk = fl_walk, @@ -2154,6 +2429,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .tmplt_destroy = fl_tmplt_destroy, .tmplt_dump = fl_tmplt_dump, .owner = THIS_MODULE, + .flags = TCF_PROTO_OPS_DOIT_UNLOCKED, }; static int __init cls_fl_init(void) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index ad036b00427d..1d0b39c3932f 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -263,7 +263,8 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (!opt) return handle ? -EINVAL : 0; /* Succeed if it is old method. */ - err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_FW_MAX, opt, fw_policy, + NULL); if (err < 0) return err; @@ -402,7 +403,7 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, if (!f->res.classid && !tcf_exts_has_actions(&f->exts)) return skb->len; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 459921bd3d87..db42d97a2006 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,9 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_mall_head *head = rcu_dereference_bh(tp->root); + if (unlikely(!head)) + return -1; + if (tc_skip_sw(head->flags)) return -1; @@ -89,12 +92,29 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, bool skip_sw = tc_skip_sw(head->flags); int err; + cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts)); + if (!cls_mall.rule) + return -ENOMEM; + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = TC_CLSMATCHALL_REPLACE; - cls_mall.exts = &head->exts; cls_mall.cookie = cookie; + err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts); + if (err) { + kfree(cls_mall.rule); + mall_destroy_hw_filter(tp, head, cookie, NULL); + if (skip_sw) + NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); + else + err = 0; + + return err; + } + err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw); + kfree(cls_mall.rule); + if (err < 0) { mall_destroy_hw_filter(tp, head, cookie, NULL); return err; @@ -130,6 +150,11 @@ static void mall_destroy(struct tcf_proto *tp, bool rtnl_held, static void *mall_get(struct tcf_proto *tp, u32 handle) { + struct cls_mall_head *head = rtnl_dereference(tp->root); + + if (head && head->handle == handle) + return head; + return NULL; } @@ -176,8 +201,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (head) return -EEXIST; - err = nla_parse_nested(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS], - mall_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_MATCHALL_MAX, + tca[TCA_OPTIONS], mall_policy, NULL); if (err < 0) return err; @@ -267,13 +292,28 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, if (tc_skip_hw(head->flags)) return 0; + cls_mall.rule = flow_rule_alloc(tcf_exts_num_actions(&head->exts)); + if (!cls_mall.rule) + return -ENOMEM; + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); cls_mall.command = add ? 
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; - cls_mall.exts = &head->exts; cls_mall.cookie = (unsigned long)head; + err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts); + if (err) { + kfree(cls_mall.rule); + if (add && tc_skip_sw(head->flags)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action"); + return err; + } + return 0; + } + err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv); + kfree(cls_mall.rule); + if (err) { if (add && tc_skip_sw(head->flags)) return err; @@ -285,6 +325,23 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } +static void mall_stats_hw_filter(struct tcf_proto *tp, + struct cls_mall_head *head, + unsigned long cookie) +{ + struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, NULL); + cls_mall.command = TC_CLSMATCHALL_STATS; + cls_mall.cookie = cookie; + + tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false); + + tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes, + cls_mall.stats.pkts, cls_mall.stats.lastused); +} + static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { @@ -296,9 +353,12 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, if (!head) return skb->len; + if (!tc_skip_hw(head->flags)) + mall_stats_hw_filter(tp, head, (unsigned long)head); + t->tcm_handle = head->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index f006af23b64a..eeff5bbfb912 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -484,7 +484,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return handle ? -EINVAL : 0; - err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, opt, + route4_policy, NULL); if (err < 0) return err; @@ -607,7 +608,7 @@ static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh, t->tcm_handle = f->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 0719a21d9c41..a4688bb92f43 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -497,7 +497,8 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return handle ? 
-EINVAL : 0; - err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy, + NULL); if (err < 0) return err; @@ -706,7 +707,7 @@ static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh, t->tcm_handle = f->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 24e0a62a65cc..9f4f4203c388 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -510,7 +510,8 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, if (!opt) return 0; - err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt, + tcindex_policy, NULL); if (err < 0) return err; @@ -601,7 +602,7 @@ static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, tp, fh, skb, t, p, r); pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 48e76a3acf8a..4b8710a266cc 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -847,7 +847,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, /* Similarly success statistics must be moved as pointers */ new->pcpu_success = n->pcpu_success; #endif - memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + memcpy(&new->sel, s, struct_size(s, keys, s->nkeys)); if (tcf_exts_init(&new->exts, net, TCA_U32_ACT, TCA_U32_POLICE)) { kfree(new); @@ -884,7 +884,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } } - err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_U32_MAX, opt, u32_policy, + extack); if (err < 0) return err; @@ -1294,7 +1295,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, t->tcm_handle = n->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index a5f34e930eff..60c26b8294b5 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -120,8 +120,8 @@ static int em_ipt_change(struct net *net, void *data, int data_len, struct xt_match *match; int mdata_len, ret; - ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy, - NULL); + ret = nla_parse_deprecated(tb, TCA_EM_IPT_MAX, data, data_len, + em_ipt_policy, NULL); if (ret < 0) return ret; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d6e97115500b..28dfa8f2a4ea 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -912,7 +912,8 @@ static int em_meta_change(struct net *net, void *data, int len, struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; - err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL); + err = nla_parse_deprecated(tb, TCA_EM_META_MAX, data, len, + meta_policy, NULL); if (err < 0) goto errout; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 1331a4c2d8ff..7b86c2a44746 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -314,7 +314,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, if (!nla) return 0; - err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL); + err = 
nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla, + em_policy, NULL); if (err < 0) goto errout; @@ -440,14 +441,14 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) struct nlattr *top_start; struct nlattr *list_start; - top_start = nla_nest_start(skb, tlv); + top_start = nla_nest_start_noflag(skb, tlv); if (top_start == NULL) goto nla_put_failure; if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) goto nla_put_failure; - list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); + list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) goto nla_put_failure; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index fb8f138b9776..607e84d67c33 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -479,7 +479,8 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, u16 *tab = NULL; int err; - err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy, + extack); if (err < 0) return ERR_PTR(err); if (!tb[TCA_STAB_BASE]) { @@ -542,7 +543,7 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) { struct nlattr *nest; - nest = nla_nest_start(skb, TCA_STAB); + nest = nla_nest_start_noflag(skb, TCA_STAB); if (nest == NULL) goto nla_put_failure; if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) @@ -998,6 +999,19 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb, qdisc_put(old); } +static void qdisc_clear_nolock(struct Qdisc *sch) +{ + sch->flags &= ~TCQ_F_NOLOCK; + if (!(sch->flags & TCQ_F_CPUSTATS)) + return; + + free_percpu(sch->cpu_bstats); + free_percpu(sch->cpu_qstats); + sch->cpu_bstats = NULL; + sch->cpu_qstats = NULL; + sch->flags &= ~TCQ_F_CPUSTATS; +} + /* Graft qdisc "new" to class "classid" of qdisc "parent" or * to device "dev". * @@ -1076,7 +1090,7 @@ skip: /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && parent && !(parent->flags & TCQ_F_NOLOCK)) - new->flags &= ~TCQ_F_NOLOCK; + qdisc_clear_nolock(new); if (!cops || !cops->graft) return -EOPNOTSUPP; @@ -1410,8 +1424,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, - extack); + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; @@ -1495,8 +1509,8 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, replay: /* Reinit, just in case something touches this. 
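
(Aside on the recurring rename in these hunks: nla_parse(), nla_parse_nested(), nlmsg_parse() and nla_nest_start() all move to *_deprecated / *_noflag spellings with identical arguments. The renames free the short names for strictly validated parsing, where nested attributes must carry NLA_F_NESTED; behavior at these call sites is unchanged. A minimal sketch of the call-site pattern, using a made-up TCA_FOO attribute space rather than any real one:)

#include <net/netlink.h>

/* Hypothetical attribute space, for illustration only. */
enum {
	TCA_FOO_UNSPEC,
	TCA_FOO_LIMIT,
	__TCA_FOO_MAX,
};
#define TCA_FOO_MAX (__TCA_FOO_MAX - 1)

static const struct nla_policy foo_policy[TCA_FOO_MAX + 1] = {
	[TCA_FOO_LIMIT] = { .type = NLA_U32 },
};

static int foo_parse_opt(struct nlattr *opt, struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_FOO_MAX + 1];

	/* Was: nla_parse_nested(tb, TCA_FOO_MAX, opt, foo_policy, extack);
	 * The _deprecated spelling parses liberally, exactly as before;
	 * nlmsg_parse_deprecated() is the message-level counterpart.
	 */
	return nla_parse_nested_deprecated(tb, TCA_FOO_MAX, opt,
					   foo_policy, extack);
}
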
*/ - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, - extack); + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; @@ -1730,8 +1744,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, - rtm_tca_policy, cb->extack); + err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX, + rtm_tca_policy, cb->extack); if (err < 0) return err; @@ -1959,8 +1973,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, - extack); + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); if (err < 0) return err; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index d714d3747bcb..ae506c7906cd 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -223,7 +223,8 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, if (opt == NULL) return -EINVAL; - error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy, NULL); + error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy, + NULL); if (error < 0) return error; @@ -609,7 +610,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, tcm->tcm_handle = flow->common.classid; tcm->tcm_info = flow->q->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index acc9b9da985f..53a80bc6b13a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1517,16 +1517,27 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) { + int wlen = skb_network_offset(skb); u8 dscp; - switch (skb->protocol) { + switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; if (wash && dscp) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); return dscp; case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + return 0; + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; if (wash && dscp) ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); @@ -2520,7 +2531,8 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, + extack); if (err < 0) return err; @@ -2724,7 +2736,7 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; @@ -2795,7 +2807,7 @@ nla_put_failure: static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { - struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); + struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tstats, *ts; int i; @@ -2825,7 +2837,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) #undef 
PUT_STAT_U32 #undef PUT_STAT_U64 - tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); + tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); if (!tstats) goto nla_put_failure; @@ -2842,7 +2854,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[q->tin_order[i]]; - ts = nla_nest_start(d->skb, i + 1); + ts = nla_nest_start_noflag(d->skb, i + 1); if (!ts) goto nla_put_failure; @@ -2962,7 +2974,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (flow) { ktime_t now = ktime_get(); - stats = nla_nest_start(d->skb, TCA_STATS_APP); + stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) return -1; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 4dc05409e3fb..ba4b33b74dd8 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1149,7 +1149,8 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } - err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, + extack); if (err < 0) return err; @@ -1305,7 +1306,7 @@ static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) struct cbq_sched_data *q = qdisc_priv(sch); struct nlattr *nest; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (cbq_dump_attr(skb, &q->link) < 0) @@ -1340,7 +1341,7 @@ cbq_dump_class(struct Qdisc *sch, unsigned long arg, tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->q->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (cbq_dump_attr(skb, cl) < 0) @@ -1358,9 +1359,11 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; + __u32 qlen; cl->xstats.avgidle = cl->avgidle; cl->xstats.undertime = 0; + qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog); if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; @@ -1368,7 +1371,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); @@ -1471,7 +1474,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t return -EINVAL; } - err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy, + extack); if (err < 0) return err; @@ -1665,17 +1669,13 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl = (struct cbq_class *)arg; - unsigned int qlen, backlog; if (cl->filters || cl->children || cl == &q->link) return -EBUSY; sch_tree_lock(sch); - qlen = cl->q->q.qlen; - backlog = cl->q->qstats.backlog; - qdisc_reset(cl->q); - qdisc_tree_reduce_backlog(cl->q, qlen, backlog); + qdisc_purge_queue(cl->q); if (cl->next_alive) cbq_deactivate_class(cl); diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index c6a502933fe7..8077c846f5bf 100644 --- a/net/sched/sch_cbs.c +++ 
b/net/sched/sch_cbs.c @@ -61,16 +61,20 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <net/netevent.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> +static LIST_HEAD(cbs_list); +static DEFINE_SPINLOCK(cbs_list_lock); + #define BYTES_PER_KBIT (1000LL / 8) struct cbs_sched_data { bool offload; int queue; - s64 port_rate; /* in bytes/s */ + atomic64_t port_rate; /* in bytes/s */ s64 last; /* timestamp in ns */ s64 credits; /* in bytes */ s32 locredit; /* in bytes */ @@ -82,6 +86,7 @@ struct cbs_sched_data { struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); struct Qdisc *qdisc; + struct list_head cbs_list; }; static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -181,6 +186,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; + if (atomic64_read(&q->port_rate) == -1) { + WARN_ONCE(1, "cbs: dequeue() called with unknown port rate."); + return NULL; + } + if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -207,7 +217,8 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) /* As sendslope is a negative number, this will decrease the * amount of q->credits. */ - credits = credits_from_len(len, q->sendslope, q->port_rate); + credits = credits_from_len(len, q->sendslope, + atomic64_read(&q->port_rate)); credits += q->credits; q->credits = max_t(s64, credits, q->locredit); @@ -294,6 +305,50 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, return 0; } +static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) +{ + struct ethtool_link_ksettings ecmd; + int port_rate = -1; + + if (!__ethtool_get_link_ksettings(dev, &ecmd) && + ecmd.base.speed != SPEED_UNKNOWN) + port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT; + + atomic64_set(&q->port_rate, port_rate); + netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n", + dev->name, (long long)atomic64_read(&q->port_rate), + ecmd.base.speed); +} + +static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct cbs_sched_data *q; + struct net_device *qdev; + bool found = false; + + ASSERT_RTNL(); + + if (event != NETDEV_UP && event != NETDEV_CHANGE) + return NOTIFY_DONE; + + spin_lock(&cbs_list_lock); + list_for_each_entry(q, &cbs_list, cbs_list) { + qdev = qdisc_dev(q->qdisc); + if (qdev == dev) { + found = true; + break; + } + } + spin_unlock(&cbs_list_lock); + + if (found) + cbs_set_port_rate(dev, q); + + return NOTIFY_DONE; +} + static int cbs_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -303,7 +358,8 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, struct tc_cbs_qopt *qopt; int err; - err = nla_parse_nested(tb, TCA_CBS_MAX, opt, cbs_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_CBS_MAX, opt, cbs_policy, + extack); if (err < 0) return err; @@ -315,16 +371,7 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_CBS_PARMS]); if (!qopt->offload) { - struct ethtool_link_ksettings ecmd; - s64 link_speed; - - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; - - q->port_rate = link_speed * 1000 * BYTES_PER_KBIT; - + cbs_set_port_rate(dev, q); cbs_disable_offload(dev, q); } else { err = cbs_enable_offload(dev, q, qopt, extack); @@ -347,6 +394,7 
@@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, { struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory"); @@ -367,7 +415,17 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, qdisc_watchdog_init(&q->watchdog, sch); - return cbs_change(sch, opt, extack); + err = cbs_change(sch, opt, extack); + if (err) + return err; + + if (!q->offload) { + spin_lock(&cbs_list_lock); + list_add(&q->cbs_list, &cbs_list); + spin_unlock(&cbs_list_lock); + } + + return 0; } static void cbs_destroy(struct Qdisc *sch) @@ -375,8 +433,11 @@ static void cbs_destroy(struct Qdisc *sch) struct cbs_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - qdisc_watchdog_cancel(&q->watchdog); + spin_lock(&cbs_list_lock); + list_del(&q->cbs_list); + spin_unlock(&cbs_list_lock); + qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); if (q->qdisc) @@ -389,7 +450,7 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_cbs_qopt opt = { }; struct nlattr *nest; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; @@ -487,14 +548,24 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static struct notifier_block cbs_device_notifier = { + .notifier_call = cbs_dev_notifier, +}; + static int __init cbs_module_init(void) { + int err = register_netdevice_notifier(&cbs_device_notifier); + + if (err) + return err; + return register_qdisc(&cbs_qdisc_ops); } static void __exit cbs_module_exit(void) { unregister_qdisc(&cbs_qdisc_ops); + unregister_netdevice_notifier(&cbs_device_notifier); } module_init(cbs_module_init) module_exit(cbs_module_exit) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index eafc0d17d174..370dbcf49e8b 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -358,7 +358,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt, + choke_policy, NULL); if (err < 0) return err; @@ -452,7 +453,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) .Scell_log = q->parms.Scell_log, }; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 17cd81f84b5d..25ef172c23df 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -141,7 +141,8 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, + codel_policy, NULL); if (err < 0) return err; @@ -217,7 +218,7 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *opts; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 09b800991065..ffcd6654c39d 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -50,15 +50,6 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) return container_of(clc, struct drr_class, common); } -static 
void drr_purge_queue(struct drr_class *cl) -{ - unsigned int len = cl->qdisc->q.qlen; - unsigned int backlog = cl->qdisc->qstats.backlog; - - qdisc_reset(cl->qdisc); - qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); -} - static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = { [TCA_DRR_QUANTUM] = { .type = NLA_U32 }, }; @@ -79,7 +70,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -EINVAL; } - err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_DRR_MAX, opt, drr_policy, + extack); if (err < 0) return err; @@ -167,7 +159,7 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); - drr_purge_queue(cl); + qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); @@ -253,7 +245,7 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg, tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) @@ -269,7 +261,8 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct drr_class *cl = (struct drr_class *)arg; - __u32 qlen = cl->qdisc->q.qlen; + __u32 qlen = qdisc_qlen_sum(cl->qdisc); + struct Qdisc *cl_q = cl->qdisc; struct tc_drr_stats xstats; memset(&xstats, 0, sizeof(xstats)); @@ -279,7 +272,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0) + gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 42471464ded3..3deeb06eaecf 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -132,7 +132,8 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, if (!opt) goto errout; - err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, + dsmark_policy, NULL); if (err < 0) goto errout; @@ -353,7 +354,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, if (err) return err; - err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, + dsmark_policy, NULL); if (err < 0) goto errout; @@ -432,7 +434,7 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); tcm->tcm_info = p->q->handle; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || @@ -451,7 +453,7 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) struct dsmark_qdisc_data *p = qdisc_priv(sch); struct nlattr *opts = NULL; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 1150f22983df..db0c2ba1d156 100644 --- a/net/sched/sch_etf.c +++ 
b/net/sched/sch_etf.c @@ -351,7 +351,8 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } - err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy, + extack); if (err < 0) return err; @@ -460,7 +461,7 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_etf_qopt opt = { }; struct nlattr *nest; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 1a662f2bb7bb..26a94e5cd5df 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -54,10 +54,23 @@ #include <net/tcp_states.h> #include <net/tcp.h> +struct fq_skb_cb { + u64 time_to_send; +}; + +static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb)); + return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; +} + /* - * Per flow structure, dynamically allocated + * Per flow structure, dynamically allocated. + * If packets have monotically increasing time_to_send, they are placed in O(1) + * in linear list (head,tail), otherwise are placed in a rbtree (t_root). */ struct fq_flow { + struct rb_root t_root; struct sk_buff *head; /* list of skbs for this flow : first skb */ union { struct sk_buff *tail; /* last skb in the list */ @@ -257,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) */ sk = (struct sock *)((hash << 1) | 1UL); skb_orphan(skb); + } else if (sk->sk_state == TCP_CLOSE) { + unsigned long hash = skb_get_hash(skb) & q->orphan_mask; + /* + * Sockets in TCP_CLOSE are non connected. + * Typical use case is UDP sockets, they can send packets + * with sendto() to many different destinations. + * We probably could use a generic bit advertising + * non connected sockets, instead of sk_state == TCP_CLOSE, + * if we care enough. 
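
(On the fq_classify() hunk above: packets from listening or TCP_CLOSE, i.e. unconnected, sockets are keyed by ((skb_get_hash(skb) & orphan_mask) << 1) | 1UL. Real struct sock pointers are at least word aligned, so bit 0 is never set on them, and these synthetic per-hash flow keys can never alias a genuine socket address. A standalone illustration of the keying, with invented names:)

#include <stdint.h>
#include <stdbool.h>

/* Illustration of the flow-key trick used by fq_classify(): bit 0 of a
 * valid struct sock pointer is always clear, so a key with bit 0 set
 * marks a synthetic, hash-derived flow for unconnected sockets.
 */
static inline uintptr_t fq_flow_key(uintptr_t sk, uint32_t skb_hash,
				    uint32_t orphan_mask, bool connected)
{
	if (connected)
		return sk;	/* key by the socket pointer itself */

	/* one flow per (hash & orphan_mask) bucket, tagged via bit 0 */
	return ((uintptr_t)(skb_hash & orphan_mask) << 1) | 1UL;
}
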
+ */ + sk = (struct sock *)((hash << 1) | 1UL); } root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; @@ -277,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) * It not, we need to refill credit with * initial quantum */ - if (unlikely(skb->sk && + if (unlikely(skb->sk == sk && f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; @@ -298,9 +322,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) q->stat_allocation_errors++; return &q->internal; } + /* f->t_root is already zeroed after kmem_cache_zalloc() */ + fq_flow_set_detached(f); f->sk = sk; - if (skb->sk) + if (skb->sk == sk) f->socket_hash = sk->sk_hash; f->credit = q->initial_quantum; @@ -312,14 +338,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) return f; } +static struct sk_buff *fq_peek(struct fq_flow *flow) +{ + struct sk_buff *skb = skb_rb_first(&flow->t_root); + struct sk_buff *head = flow->head; + + if (!skb) + return head; + + if (!head) + return skb; + + if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) + return skb; + return head; +} + +static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, + struct sk_buff *skb) +{ + if (skb == flow->head) { + flow->head = skb->next; + } else { + rb_erase(&skb->rbnode, &flow->t_root); + skb->dev = qdisc_dev(sch); + } +} /* remove one skb from head of flow queue */ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) { - struct sk_buff *skb = flow->head; + struct sk_buff *skb = fq_peek(flow); if (skb) { - flow->head = skb->next; + fq_erase_head(sch, flow, skb); skb_mark_not_on_list(skb); flow->qlen--; qdisc_qstats_backlog_dec(sch, skb); @@ -330,15 +382,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) { - struct sk_buff *head = flow->head; + struct rb_node **p, *parent; + struct sk_buff *head, *aux; - skb->next = NULL; - if (!head) - flow->head = skb; - else - flow->tail->next = skb; + fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); + + head = flow->head; + if (!head || + fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { + if (!head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; + return; + } + + p = &flow->t_root.rb_node; + parent = NULL; - flow->tail = skb; + while (*p) { + parent = *p; + aux = rb_to_skb(parent); + if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &flow->t_root); } static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -450,9 +523,9 @@ begin: goto begin; } - skb = f->head; + skb = fq_peek(f); if (skb) { - u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp), + u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); if (now < time_next_packet) { @@ -533,6 +606,15 @@ out: static void fq_flow_purge(struct fq_flow *flow) { + struct rb_node *p = rb_first(&flow->t_root); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + rb_erase(&skb->rbnode, &flow->t_root); + rtnl_kfree_skbs(skb, skb); + } rtnl_kfree_skbs(flow->head, flow->tail); flow->head = NULL; flow->qlen = 0; @@ -684,7 +766,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (!opt) return 
-EINVAL; - err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, + NULL); if (err < 0) return err; @@ -823,7 +906,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) u64 ce_threshold = q->ce_threshold; struct nlattr *opts; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index cd04d40c30b6..08d85370b97c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -387,8 +387,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy, - NULL); + err = nla_parse_nested_deprecated(tb, TCA_FQ_CODEL_MAX, opt, + fq_codel_policy, NULL); if (err < 0) return err; if (tb[TCA_FQ_CODEL_FLOWS]) { @@ -527,7 +527,7 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *opts; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a117d9260558..cce1e9ee85af 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -32,6 +32,7 @@ #include <net/pkt_sched.h> #include <net/dst.h> #include <trace/events/qdisc.h> +#include <trace/events/net.h> #include <net/xfrm.h> /* Qdisc to use by default */ @@ -68,7 +69,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_atomic_qlen_dec(q); + qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -108,7 +109,7 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_atomic_qlen_inc(q); + qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; @@ -118,52 +119,36 @@ static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, spin_unlock(lock); } -static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - while (skb) { - struct sk_buff *next = skb->next; - - __skb_queue_tail(&q->gso_skb, skb); - q->qstats.requeues++; - qdisc_qstats_backlog_inc(q, skb); - q->q.qlen++; /* it's still part of the queue */ + spinlock_t *lock = NULL; - skb = next; + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); } - __netif_schedule(q); - return 0; -} - -static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) -{ - spinlock_t *lock = qdisc_lock(q); - - spin_lock(lock); while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); - qdisc_qstats_cpu_requeues_inc(q); - qdisc_qstats_cpu_backlog_inc(q, skb); - qdisc_qstats_atomic_qlen_inc(q); + /* it's still part of the queue */ + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + q->qstats.requeues++; + qdisc_qstats_backlog_inc(q, skb); + q->q.qlen++; + } skb = next; } - spin_unlock(lock); - + if (lock) + spin_unlock(lock); __netif_schedule(q); - - return 0; -} - -static inline int dev_requeue_skb(struct sk_buff 
*skb, struct Qdisc *q) -{ - if (q->flags & TCQ_F_NOLOCK) - return dev_requeue_skb_locked(skb, q); - else - return __dev_requeue_skb(skb, q); } static void try_bulk_dequeue_skb(struct Qdisc *q, @@ -252,7 +237,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); - qdisc_qstats_atomic_qlen_dec(q); + qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; @@ -457,6 +442,7 @@ static void dev_watchdog(struct timer_list *t) } if (some_queue_timedout) { + trace_net_dev_xmit_timeout(dev, i); WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", dev->name, netdev_drivername(dev), i); dev->netdev_ops->ndo_tx_timeout(dev); @@ -645,11 +631,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, if (unlikely(err)) return qdisc_drop_cpu(skb, qdisc, to_free); - qdisc_qstats_atomic_qlen_inc(qdisc); - /* Note: skb can not be used after skb_array_produce(), - * so we better not use qdisc_qstats_cpu_backlog_inc() - */ - this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len); + qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } @@ -668,9 +650,9 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) skb = __skb_array_consume(q); } if (likely(skb)) { - qdisc_qstats_cpu_backlog_dec(qdisc, skb); - qdisc_bstats_cpu_update(qdisc, skb); - qdisc_qstats_atomic_qlen_dec(qdisc); + qdisc_update_stats_at_dequeue(qdisc, skb); + } else { + qdisc->empty = true; } return skb; @@ -714,6 +696,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; + q->qlen = 0; } } @@ -880,6 +863,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->empty = true; dev_hold(dev); refcount_set(&sch->refcnt, 1); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 234afbf9115b..dfa657da100f 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -538,7 +538,8 @@ static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry) struct nlattr *tb[TCA_GRED_VQ_MAX + 1]; u32 dp; - nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL); + nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry, + gred_vq_policy, NULL); dp = nla_get_u32(tb[TCA_GRED_VQ_DP]); @@ -568,8 +569,8 @@ static int gred_vq_validate(struct gred_sched *table, u32 cdp, int err; u32 dp; - err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, - extack); + err = nla_parse_nested_deprecated(tb, TCA_GRED_VQ_MAX, entry, + gred_vq_policy, extack); if (err < 0) return err; @@ -610,8 +611,8 @@ static int gred_vqs_validate(struct gred_sched *table, u32 cdp, const struct nlattr *attr; int rem, err; - err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX, - gred_vqe_policy, extack); + err = nla_validate_nested_deprecated(vqs, TCA_GRED_VQ_ENTRY_MAX, + gred_vqe_policy, extack); if (err < 0) return err; @@ -650,7 +651,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy, + extack); if (err < 0) return err; @@ -737,7 +739,8 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, 
gred_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_GRED_MAX, opt, gred_policy, + extack); if (err < 0) return err; @@ -772,7 +775,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (gred_offload_dump_stats(sch)) goto nla_put_failure; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) @@ -790,7 +793,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; /* Old style all-in-one dump of VQs */ - parms = nla_nest_start(skb, TCA_GRED_PARMS); + parms = nla_nest_start_noflag(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; @@ -841,7 +844,7 @@ append_opt: nla_nest_end(skb, parms); /* Dump the VQs again, in more structured way */ - vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST); + vqs = nla_nest_start_noflag(skb, TCA_GRED_VQ_LIST); if (!vqs) goto nla_put_failure; @@ -852,7 +855,7 @@ append_opt: if (!q) continue; - vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY); + vq = nla_nest_start_noflag(skb, TCA_GRED_VQ_ENTRY); if (!vq) goto nla_put_failure; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 24cc220a3218..433f2190960f 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -845,16 +845,6 @@ qdisc_peek_len(struct Qdisc *sch) } static void -hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) -{ - unsigned int len = cl->qdisc->q.qlen; - unsigned int backlog = cl->qdisc->qstats.backlog; - - qdisc_reset(cl->qdisc); - qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); -} - -static void hfsc_adjust_levels(struct hfsc_class *cl) { struct hfsc_class *p; @@ -936,7 +926,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_HFSC_MAX, opt, hfsc_policy, + NULL); if (err < 0) return err; @@ -1076,7 +1067,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, qdisc_class_hash_insert(&q->clhash, &cl->cl_common); list_add_tail(&cl->siblings, &parent->children); if (parent->level == 0) - hfsc_purge_queue(sch, parent); + qdisc_purge_queue(parent->qdisc); hfsc_adjust_levels(parent); sch_tree_unlock(sch); @@ -1112,7 +1103,7 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg) list_del(&cl->siblings); hfsc_adjust_levels(cl->cl_parent); - hfsc_purge_queue(sch, cl); + qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->cl_common); sch_tree_unlock(sch); @@ -1310,7 +1301,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, if (cl->level == 0) tcm->tcm_info = cl->qdisc->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) @@ -1328,8 +1319,9 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct hfsc_class *cl = (struct hfsc_class *)arg; struct tc_hfsc_stats xstats; + __u32 qlen; - cl->qstats.backlog = cl->qdisc->qstats.backlog; + qdisc_qstats_qlen_backlog(cl->qdisc, &qlen, &cl->qstats.backlog); xstats.level = cl->level; xstats.period = cl->cl_vtperiod; xstats.work = cl->cl_total; @@ -1337,7 +1329,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, 
&cl->qstats, cl->qdisc->q.qlen) < 0) + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 9d6a47697406..a28e09b1609c 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -518,7 +518,8 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy, + NULL); if (err < 0) return err; @@ -654,7 +655,7 @@ static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 30f9da7e1076..909370049fca 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -165,7 +165,8 @@ struct htb_sched { /* non shaped skbs; let them go directly thru */ struct qdisc_skb_head direct_queue; - long direct_pkts; + u32 direct_pkts; + u32 overlimits; struct qdisc_watchdog watchdog; @@ -533,8 +534,10 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) if (new_mode == cl->cmode) return; - if (new_mode == HTB_CANT_SEND) + if (new_mode == HTB_CANT_SEND) { cl->overlimits++; + q->overlimits++; + } if (cl->prio_activity) { /* not necessary: speed optimization */ if (cl->cmode != HTB_CANT_SEND) @@ -937,7 +940,6 @@ ok: goto ok; } } - qdisc_qstats_overlimit(sch); if (likely(next_event > q->now)) qdisc_watchdog_schedule_ns(&q->watchdog, next_event); else @@ -1012,7 +1014,8 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, if (err) return err; - err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy, + NULL); if (err < 0) return err; @@ -1047,6 +1050,7 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest; struct tc_htb_glob gopt; + sch->qstats.overlimits = q->overlimits; /* Its safe to not acquire qdisc lock. As we hold RTNL, * no change can happen on the qdisc parameters. 
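
(Many hunks in this series, in drr, hfsc, cbq, qfq and htb below, replace the same open-coded four-step idiom with qdisc_purge_queue(). Reconstructed purely from the code being deleted, the helper is equivalent to the sketch below; the in-tree version reads the counters through qdisc_qstats_qlen_backlog(), visible in the hfsc and htb hunks, so it is also correct for NOLOCK qdiscs with per-CPU queue stats:)

#include <net/sch_generic.h>

/* Sketch only: equivalent of the open-coded sequences this series
 * deletes, not the exact in-tree body of qdisc_purge_queue().
 */
static void qdisc_purge_queue_sketch(struct Qdisc *sch)
{
	unsigned int qlen = sch->q.qlen;
	unsigned int backlog = sch->qstats.backlog;

	qdisc_reset(sch);
	/* Tell every ancestor it just lost qlen packets / backlog bytes. */
	qdisc_tree_reduce_backlog(sch, qlen, backlog);
}
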
*/ @@ -1057,7 +1061,7 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) gopt.defcls = q->defcls; gopt.debug = 0; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) || @@ -1086,7 +1090,7 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, if (!cl->level && cl->leaf.q) tcm->tcm_info = cl->leaf.q->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -1127,10 +1131,9 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) }; __u32 qlen = 0; - if (!cl->level && cl->leaf.q) { - qlen = cl->leaf.q->q.qlen; - qs.backlog = cl->leaf.q->qstats.backlog; - } + if (!cl->level && cl->leaf.q) + qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog); + cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), INT_MIN, INT_MAX); cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens), @@ -1270,13 +1273,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); - if (!cl->level) { - unsigned int qlen = cl->leaf.q->q.qlen; - unsigned int backlog = cl->leaf.q->qstats.backlog; - - qdisc_reset(cl->leaf.q); - qdisc_tree_reduce_backlog(cl->leaf.q, qlen, backlog); - } + if (!cl->level) + qdisc_purge_queue(cl->leaf.q); /* delete from hash and active; remainder in destroy_class */ qdisc_class_hash_remove(&q->clhash, &cl->common); @@ -1316,7 +1314,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!opt) goto failure; - err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy, + NULL); if (err < 0) goto failure; @@ -1404,12 +1403,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, classid, NULL); sch_tree_lock(sch); if (parent && !parent->level) { - unsigned int qlen = parent->leaf.q->q.qlen; - unsigned int backlog = parent->leaf.q->qstats.backlog; - /* turn parent into inner node */ - qdisc_reset(parent->leaf.q); - qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog); + qdisc_purge_queue(parent->leaf.q); qdisc_put(parent->leaf.q); if (parent->prio_activity) htb_deactivate(q, parent); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index ce3f55259d0d..0bac926b46c7 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -106,7 +106,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) { struct nlattr *nest; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 203659bc3906..3a3312467692 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -249,7 +249,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) + qdisc_qstats_copy(d, sch) < 0) return -1; return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index d364e63c396d..d05086dc3866 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -125,8 +125,9 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, int nested_len = nla_len(nla) - NLA_ALIGN(len); if (nested_len >= nla_attr_size(0)) - return nla_parse(tb, maxtype, 
nla_data(nla) + NLA_ALIGN(len), - nested_len, policy, NULL); + return nla_parse_deprecated(tb, maxtype, + nla_data(nla) + NLA_ALIGN(len), + nested_len, policy, NULL); memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; @@ -349,7 +350,7 @@ static int dump_rates(struct mqprio_sched *priv, int i; if (priv->flags & TC_MQPRIO_F_MIN_RATE) { - nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64); + nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MIN_RATE64); if (!nest) goto nla_put_failure; @@ -363,7 +364,7 @@ static int dump_rates(struct mqprio_sched *priv, } if (priv->flags & TC_MQPRIO_F_MAX_RATE) { - nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64); + nest = nla_nest_start_noflag(skb, TCA_MQPRIO_MAX_RATE64); if (!nest) goto nla_put_failure; @@ -561,8 +562,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, NULL, - &sch->qstats, sch->q.qlen) < 0) + qdisc_qstats_copy(d, sch) < 0) return -1; } return 0; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 7410ce4d0321..35b03ae08e0f 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -201,9 +201,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, for (i = q->bands; i < q->max_bands; i++) { if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; + q->queues[i] = &noop_qdisc; - qdisc_tree_reduce_backlog(child, child->q.qlen, - child->qstats.backlog); + qdisc_tree_flush_backlog(child); qdisc_put(child); } } @@ -225,9 +225,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, qdisc_hash_add(child, true); if (old != &noop_qdisc) { - qdisc_tree_reduce_backlog(old, - old->q.qlen, - old->qstats.backlog); + qdisc_tree_flush_backlog(old); qdisc_put(old); } sch_tree_unlock(sch); @@ -344,7 +342,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) + qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index cc9d8133afcd..78aa76b0da2e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -935,8 +935,9 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, } if (nested_len >= nla_attr_size(0)) - return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), - nested_len, policy, NULL); + return nla_parse_deprecated(tb, maxtype, + nla_data(nla) + NLA_ALIGN(len), + nested_len, policy, NULL); memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; @@ -1079,7 +1080,7 @@ static int dump_loss_model(const struct netem_sched_data *q, { struct nlattr *nest; - nest = nla_nest_start(skb, TCA_NETEM_LOSS); + nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 1cc0c7b74aa3..8fa129d3943e 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -216,7 +216,8 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, + NULL); if (err < 0) return err; @@ -491,7 +492,7 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) struct pie_sched_data *q 
= qdisc_priv(sch); struct nlattr *opts; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 847141cd900f..d519b21535b3 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -216,12 +216,8 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); - for (i = q->bands; i < oldbands; i++) { - struct Qdisc *child = q->queues[i]; - - qdisc_tree_reduce_backlog(child, child->q.qlen, - child->qstats.backlog); - } + for (i = q->bands; i < oldbands; i++) + qdisc_tree_flush_backlog(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; @@ -365,7 +361,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl_q->bstats) < 0 || - gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0) + qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 29f5c4a24688..3f9e8b425ac6 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -217,15 +217,6 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) return container_of(clc, struct qfq_class, common); } -static void qfq_purge_queue(struct qfq_class *cl) -{ - unsigned int len = cl->qdisc->q.qlen; - unsigned int backlog = cl->qdisc->qstats.backlog; - - qdisc_reset(cl->qdisc); - qdisc_tree_reduce_backlog(cl->qdisc, len, backlog); -} - static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, [TCA_QFQ_LMAX] = { .type = NLA_U32 }, @@ -419,8 +410,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -EINVAL; } - err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy, - NULL); + err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], + qfq_policy, NULL); if (err < 0) return err; @@ -551,7 +542,7 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) sch_tree_lock(sch); - qfq_purge_queue(cl); + qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); @@ -628,7 +619,7 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || @@ -655,8 +646,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, - &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0) + qdisc_qstats_copy(d, cl->qdisc) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 9df9942340ea..1e68a13bb66b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -205,7 +205,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_RED_MAX, opt, red_policy, + NULL); if (err < 0) return err; @@ -233,8 +234,7 @@ static int 
red_change(struct Qdisc *sch, struct nlattr *opt, q->flags = ctl->flags; q->limit = ctl->limit; if (child) { - qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, - q->qdisc->qstats.backlog); + qdisc_tree_flush_backlog(q->qdisc); old_child = q->qdisc; q->qdisc = child; } @@ -319,7 +319,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (err) goto nla_put_failure; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index bab506b01a32..b245d6a2068d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -499,7 +499,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, int err; if (opt) { - err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_SFB_MAX, opt, + sfb_policy, NULL); if (err < 0) return -EINVAL; @@ -521,8 +522,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt, qdisc_hash_add(child, true); sch_tree_lock(sch); - qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, - q->qdisc->qstats.backlog); + qdisc_tree_flush_backlog(q->qdisc); qdisc_put(q->qdisc); q->qdisc = child; @@ -581,7 +581,7 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) }; sch->qstats.backlog = q->qdisc->qstats.backlog; - opts = nla_nest_start(skb, TCA_OPTIONS); + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 206e4dbed12f..9ecfb8f5902a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -13,13 +13,18 @@ #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> +#include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> +#include <linux/rcupdate.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> +static LIST_HEAD(taprio_list); +static DEFINE_SPINLOCK(taprio_list_lock); + #define TAPRIO_ALL_GATES_OPEN -1 struct sched_entry { @@ -37,24 +42,88 @@ struct sched_entry { u8 command; }; +struct sched_gate_list { + struct rcu_head rcu; + struct list_head entries; + size_t num_entries; + ktime_t cycle_close_time; + s64 cycle_time; + s64 cycle_time_extension; + s64 base_time; +}; + struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; - s64 base_time; int clockid; - int picos_per_byte; /* Using picoseconds because for 10Gbps+ - * speeds it's sub-nanoseconds per byte - */ - size_t num_entries; + atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; - struct list_head entries; + struct sched_gate_list __rcu *oper_sched; + struct sched_gate_list __rcu *admin_sched; ktime_t (*get_time)(void); struct hrtimer advance_timer; + struct list_head taprio_list; }; +static ktime_t sched_base_time(const struct sched_gate_list *sched) +{ + if (!sched) + return KTIME_MAX; + + return ns_to_ktime(sched->base_time); +} + +static void taprio_free_sched_cb(struct rcu_head *head) +{ + struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); + struct sched_entry *entry, *n; + + if (!sched) + return; + + list_for_each_entry_safe(entry, n, &sched->entries, 
list) { + list_del(&entry->list); + kfree(entry); + } + + kfree(sched); +} + +static void switch_schedules(struct taprio_sched *q, + struct sched_gate_list **admin, + struct sched_gate_list **oper) +{ + rcu_assign_pointer(q->oper_sched, *admin); + rcu_assign_pointer(q->admin_sched, NULL); + + if (*oper) + call_rcu(&(*oper)->rcu, taprio_free_sched_cb); + + *oper = *admin; + *admin = NULL; +} + +static ktime_t get_cycle_time(struct sched_gate_list *sched) +{ + struct sched_entry *entry; + ktime_t cycle = 0; + + if (sched->cycle_time != 0) + return sched->cycle_time; + + list_for_each_entry(entry, &sched->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + + sched->cycle_time = cycle; + + return cycle; +} + static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -85,7 +154,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) rcu_read_lock(); entry = rcu_dereference(q->current_entry); - gate_mask = entry ? entry->gate_mask : -1; + gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; rcu_read_unlock(); if (!gate_mask) @@ -107,7 +176,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) - return NULL; + continue; return skb; } @@ -117,18 +186,30 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch) static inline int length_to_duration(struct taprio_sched *q, int len) { - return (len * q->picos_per_byte) / 1000; + return div_u64(len * atomic64_read(&q->picos_per_byte), 1000); +} + +static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) +{ + atomic_set(&entry->budget, + div64_u64((u64)entry->interval * 1000, + atomic64_read(&q->picos_per_byte))); } static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb = NULL; struct sched_entry *entry; - struct sk_buff *skb; u32 gate_mask; int i; + if (atomic64_read(&q->picos_per_byte) == -1) { + WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte."); + return NULL; + } + rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't @@ -137,10 +218,9 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - rcu_read_unlock(); if (!gate_mask) - return NULL; + goto done; for (i = 0; i < dev->num_tx_queues; i++) { struct Qdisc *child = q->qdiscs[i]; @@ -171,39 +251,81 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && ktime_after(guard, entry->close_time)) - return NULL; + continue; /* ... and no budget.
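 * (the budget is entry->interval * 1000 / picos_per_byte bytes; at
 * 1Gb/s that is 8000ps per byte, so a 100us interval allows 12500
 * bytes before dequeue falls through to the next traffic class)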
*/ if (gate_mask != TAPRIO_ALL_GATES_OPEN && atomic_sub_return(len, &entry->budget) < 0) - return NULL; + continue; skb = child->ops->dequeue(child); if (unlikely(!skb)) - return NULL; + goto done; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; - return skb; + goto done; } - return NULL; +done: + rcu_read_unlock(); + + return skb; } -static bool should_restart_cycle(const struct taprio_sched *q, +static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { - WARN_ON(!entry); + if (list_is_last(&entry->list, &oper->entries)) + return true; + + if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0) + return true; + + return false; +} + +static bool should_change_schedules(const struct sched_gate_list *admin, + const struct sched_gate_list *oper, + ktime_t close_time) +{ + ktime_t next_base_time, extension_time; + + if (!admin) + return false; - return list_is_last(&entry->list, &q->entries); + next_base_time = sched_base_time(admin); + + /* This is the simple case: the close_time would fall after + * the next schedule base_time. + */ + if (ktime_compare(next_base_time, close_time) <= 0) + return true; + + /* This is the cycle_time_extension case: if the close_time + * plus the amount that can be extended would fall after the + * next schedule base_time, we can extend the current schedule + * for that amount. + */ + extension_time = ktime_add_ns(close_time, oper->cycle_time_extension); + + /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about + * how precisely the extension should be made. So after + * conformance testing, this logic may change. + */ + if (ktime_compare(next_base_time, extension_time) <= 0) + return true; + + return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); + struct sched_gate_list *oper, *admin; struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t close_time; @@ -211,29 +333,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); + oper = rcu_dereference_protected(q->oper_sched, + lockdep_is_held(&q->current_entry_lock)); + admin = rcu_dereference_protected(q->admin_sched, + lockdep_is_held(&q->current_entry_lock)); - /* This is the case that it's the first time that the schedule - * runs, so it only happens once per schedule. The first entry - * is pre-calculated during the schedule initialization. + if (!oper) + switch_schedules(q, &admin, &oper); + + /* This can happen in two cases: 1. this is the very first run + * of this function (i.e. we weren't running any schedule + * previously); 2. The previous schedule just ended. The first + * entry of each schedule is pre-calculated during the + * schedule initialization.
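+ * (the pre-calculation is setup_first_close_time(): it sets the
+ * first entry's close_time and byte budget before the advance
+ * timer is started)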
*/ - if (unlikely(!entry)) { - next = list_first_entry(&q->entries, struct sched_entry, + if (unlikely(!entry || entry->close_time == oper->base_time)) { + next = list_first_entry(&oper->entries, struct sched_entry, list); close_time = next->close_time; goto first_run; } - if (should_restart_cycle(q, entry)) - next = list_first_entry(&q->entries, struct sched_entry, + if (should_restart_cycle(oper, entry)) { + next = list_first_entry(&oper->entries, struct sched_entry, list); - else + oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time, + oper->cycle_time); + } else { next = list_next_entry(entry, list); + } close_time = ktime_add_ns(entry->close_time, next->interval); + close_time = min_t(ktime_t, close_time, oper->cycle_close_time); + + if (should_change_schedules(admin, oper, close_time)) { + /* Set things so the next time this runs, the new + * schedule runs. + */ + close_time = sched_base_time(admin); + switch_schedules(q, &admin, &oper); + } next->close_time = close_time; - atomic_set(&next->budget, - (next->interval * 1000) / q->picos_per_byte); + taprio_set_budget(q, next); first_run: rcu_assign_pointer(q->current_entry, next); @@ -263,10 +405,12 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, - [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, - [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, - [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, - [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, + [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, + [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, @@ -302,8 +446,8 @@ static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry, struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; - err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, - entry_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, + entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; @@ -314,70 +458,8 @@ static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry, return fill_sched_entry(tb, entry, extack); } -/* Returns the number of entries in case of success */ -static int parse_sched_single_entry(struct nlattr *n, - struct taprio_sched *q, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; - struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { }; - struct sched_entry *entry; - bool found = false; - u32 index; - int err; - - err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX, - n, entry_list_policy, NULL); - if (err < 0) { - NL_SET_ERR_MSG(extack, "Could not parse nested entry"); - return -EINVAL; - } - - if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) { - NL_SET_ERR_MSG(extack, "Single-entry must include an entry"); - return -EINVAL; - } - - err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX, - tb_list[TCA_TAPRIO_SCHED_ENTRY], - entry_policy, NULL); - if (err < 0) { - NL_SET_ERR_MSG(extack, "Could not parse nested entry"); - return -EINVAL; - } - - if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) { - 
NL_SET_ERR_MSG(extack, "Entry must specify an index\n"); - return -EINVAL; - } - - index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]); - if (index >= q->num_entries) { - NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule"); - return -EINVAL; - } - - list_for_each_entry(entry, &q->entries, list) { - if (entry->index == index) { - found = true; - break; - } - } - - if (!found) { - NL_SET_ERR_MSG(extack, "Could not find entry"); - return -ENOENT; - } - - err = fill_sched_entry(tb_entry, entry, extack); - if (err < 0) - return err; - - return q->num_entries; -} - static int parse_sched_list(struct nlattr *list, - struct taprio_sched *q, + struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; @@ -407,64 +489,42 @@ static int parse_sched_list(struct nlattr *list, return err; } - list_add_tail(&entry->list, &q->entries); + list_add_tail(&entry->list, &sched->entries); i++; } - q->num_entries = i; + sched->num_entries = i; return i; } -/* Returns the number of entries in case of success */ -static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q, - struct netlink_ext_ack *extack) +static int parse_taprio_schedule(struct nlattr **tb, + struct sched_gate_list *new, + struct netlink_ext_ack *extack) { int err = 0; - int clockid; - if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] && - tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) - return -EINVAL; - - if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0) - return -EINVAL; - - if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) - return -EINVAL; + if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { + NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); + return -ENOTSUPP; + } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) - q->base_time = nla_get_s64( - tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); - if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) + new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); - /* We only support static clockids and we don't allow - * for it to be modified after the first init. - */ - if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) - return -EINVAL; - - q->clockid = clockid; - } + if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) + new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list( - tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack); - else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) - err = parse_sched_single_entry( - tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack); - - /* parse_sched_* return the number of entries in the schedule, - * a schedule with zero entries is an error. 
- */ - if (err == 0) { - NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry"); - return -EINVAL; - } + tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); + if (err < 0) + return err; - return err; + return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, @@ -473,11 +533,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, { int i, j; - if (!qopt) { + if (!qopt && !dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } + /* If num_tc is already set, it means that the user already + * configured the mqprio part + */ + if (dev->num_tc) + return 0; + /* Verify num_tc is not out of max range */ if (qopt->num_tc > TC_MAX_QUEUE) { NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); @@ -523,76 +589,141 @@ static int taprio_parse_mqprio_opt(struct net_device *dev, return 0; } -static ktime_t taprio_get_start_time(struct Qdisc *sch) +static int taprio_get_start_time(struct Qdisc *sch, + struct sched_gate_list *sched, + ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); - struct sched_entry *entry; ktime_t now, base, cycle; s64 n; - base = ns_to_ktime(q->base_time); - cycle = 0; - - /* Calculate the cycle_time, by summing all the intervals. - */ - list_for_each_entry(entry, &q->entries, list) - cycle = ktime_add_ns(cycle, entry->interval); + base = sched_base_time(sched); + now = q->get_time(); - if (!cycle) - return base; + if (ktime_after(base, now)) { + *start = base; + return 0; + } - now = q->get_time(); + cycle = get_cycle_time(sched); - if (ktime_after(base, now)) - return base; + /* The qdisc is expected to have at least one sched_entry. Moreover, + * any entry must have 'interval' > 0. Thus if the cycle time is zero, + * something went really wrong. In that case, we should warn about this + * inconsistent state and return error. + */ + if (WARN_ON(!cycle)) + return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); - - return ktime_add_ns(base, (n + 1) * cycle); + *start = ktime_add_ns(base, (n + 1) * cycle); + return 0; } -static void taprio_start_sched(struct Qdisc *sch, ktime_t start) +static void setup_first_close_time(struct taprio_sched *q, + struct sched_gate_list *sched, ktime_t base) { - struct taprio_sched *q = qdisc_priv(sch); struct sched_entry *first; - unsigned long flags; + ktime_t cycle; - spin_lock_irqsave(&q->current_entry_lock, flags); + first = list_first_entry(&sched->entries, + struct sched_entry, list); + + cycle = get_cycle_time(sched); - first = list_first_entry(&q->entries, struct sched_entry, - list); + /* FIXME: find a better place to do this */ + sched->cycle_close_time = ktime_add_ns(base, cycle); - first->close_time = ktime_add_ns(start, first->interval); - atomic_set(&first->budget, - (first->interval * 1000) / q->picos_per_byte); + first->close_time = ktime_add_ns(base, first->interval); + taprio_set_budget(q, first); rcu_assign_pointer(q->current_entry, NULL); +} - spin_unlock_irqrestore(&q->current_entry_lock, flags); +static void taprio_start_sched(struct Qdisc *sch, + ktime_t start, struct sched_gate_list *new) +{ + struct taprio_sched *q = qdisc_priv(sch); + ktime_t expires; + + expires = hrtimer_get_expires(&q->advance_timer); + if (expires == 0) + expires = KTIME_MAX; + + /* If the new schedule starts before the next expiration, we + * reprogram it to the earliest one, so we change the admin + * schedule to the operational one at the right time. 
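+ * (e.g. with the timer already armed to fire 500ms from now and a
+ * new schedule whose base_time is only 100ms away, keeping the later
+ * expiry would delay the admin -> oper switch by 400ms)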
+ */ + start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } +static void taprio_set_picos_per_byte(struct net_device *dev, + struct taprio_sched *q) +{ + struct ethtool_link_ksettings ecmd; + int picos_per_byte = -1; + + if (!__ethtool_get_link_ksettings(dev, &ecmd) && + ecmd.base.speed != SPEED_UNKNOWN) + picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, + ecmd.base.speed * 1000 * 1000); + + atomic64_set(&q->picos_per_byte, picos_per_byte); + netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", + dev->name, (long long)atomic64_read(&q->picos_per_byte), + ecmd.base.speed); +} + +static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net_device *qdev; + struct taprio_sched *q; + bool found = false; + + ASSERT_RTNL(); + + if (event != NETDEV_UP && event != NETDEV_CHANGE) + return NOTIFY_DONE; + + spin_lock(&taprio_list_lock); + list_for_each_entry(q, &taprio_list, taprio_list) { + qdev = qdisc_dev(q->root); + if (qdev == dev) { + found = true; + break; + } + } + spin_unlock(&taprio_list_lock); + + if (found) + taprio_set_picos_per_byte(dev, q); + + return NOTIFY_DONE; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; + struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; - struct ethtool_link_ksettings ecmd; - int i, err, size; - s64 link_speed; + int i, err, clockid; + unsigned long flags; ktime_t start; - err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt, - taprio_policy, extack); + err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, + taprio_policy, extack); if (err < 0) return err; - err = -EINVAL; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); @@ -600,13 +731,78 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; - /* A schedule with less than one entry is an error */ - size = parse_taprio_opt(tb, q, extack); - if (size < 0) - return size; + new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); + if (!new_admin) { + NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); + return -ENOMEM; + } + INIT_LIST_HEAD(&new_admin->entries); - hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + rcu_read_unlock(); + + if (mqprio && (oper || admin)) { + NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); + err = -ENOTSUPP; + goto free_sched; + } + + err = parse_taprio_schedule(tb, new_admin, extack); + if (err < 0) + goto free_sched; + + if (new_admin->num_entries == 0) { + NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); + err = -EINVAL; + goto free_sched; + } + + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + + /* We only support static clockids and we don't allow + * for it to be modified after the first init. 
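+ * (every close_time is computed from the clock selected here, so
+ * switching clocks under a running schedule would invalidate them)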
+ */ + if (clockid < 0 || + (q->clockid != -1 && q->clockid != clockid)) { + NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); + err = -ENOTSUPP; + goto free_sched; + } + + q->clockid = clockid; + } + + if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); + err = -EINVAL; + goto free_sched; + } + + taprio_set_picos_per_byte(dev, q); + + /* Protects against enqueue()/dequeue() */ + spin_lock_bh(qdisc_lock(sch)); + + if (!hrtimer_active(&q->advance_timer)) { + hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; + } + + if (mqprio) { + netdev_set_num_tc(dev, mqprio->num_tc); + for (i = 0; i < mqprio->num_tc; i++) + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); + } switch (q->clockid) { case CLOCK_REALTIME: @@ -622,65 +818,52 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->get_time = ktime_get_clocktai; break; default: - return -ENOTSUPP; + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + err = -EINVAL; + goto unlock; } - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *dev_queue; - struct Qdisc *qdisc; - - dev_queue = netdev_get_tx_queue(dev, i); - qdisc = qdisc_create_dflt(dev_queue, - &pfifo_qdisc_ops, - TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1)), - extack); - if (!qdisc) - return -ENOMEM; + err = taprio_get_start_time(sch, new_admin, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); + goto unlock; + } - if (i < dev->real_num_tx_queues) - qdisc_hash_add(qdisc, false); + setup_first_close_time(q, new_admin, start); - q->qdiscs[i] = qdisc; - } + /* Protects against advance_sched() */ + spin_lock_irqsave(&q->current_entry_lock, flags); - if (mqprio) { - netdev_set_num_tc(dev, mqprio->num_tc); - for (i = 0; i < mqprio->num_tc; i++) - netdev_set_tc_queue(dev, i, - mqprio->count[i], - mqprio->offset[i]); + taprio_start_sched(sch, start, new_admin); - /* Always use supplied priority mappings */ - for (i = 0; i < TC_BITMASK + 1; i++) - netdev_set_prio_tc_map(dev, i, - mqprio->prio_tc_map[i]); - } + rcu_assign_pointer(q->admin_sched, new_admin); + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); + new_admin = NULL; - if (!__ethtool_get_link_ksettings(dev, &ecmd)) - link_speed = ecmd.base.speed; - else - link_speed = SPEED_1000; + spin_unlock_irqrestore(&q->current_entry_lock, flags); - q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, - link_speed * 1000 * 1000); + err = 0; - start = taprio_get_start_time(sch); - if (!start) - return 0; +unlock: + spin_unlock_bh(qdisc_lock(sch)); - taprio_start_sched(sch, start); +free_sched: + kfree(new_admin); - return 0; + return err; } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct sched_entry *entry, *n; unsigned int i; + spin_lock(&taprio_list_lock); + list_del(&q->taprio_list); + spin_unlock(&taprio_list_lock); + hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { @@ -693,10 +876,11 @@ static void taprio_destroy(struct Qdisc *sch) netdev_set_num_tc(dev, 0); - list_for_each_entry_safe(entry, n, &q->entries, list) { - list_del(&entry->list); - kfree(entry); - } + if (q->oper_sched) + call_rcu(&q->oper_sched->rcu, 
taprio_free_sched_cb); + + if (q->admin_sched) + call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, @@ -704,12 +888,12 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + int i; - INIT_LIST_HEAD(&q->entries); spin_lock_init(&q->current_entry_lock); - /* We may overwrite the configuration later */ hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; q->root = sch; @@ -735,6 +919,29 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt) return -EINVAL; + spin_lock(&taprio_list_lock); + list_add(&q->taprio_list, &taprio_list); + spin_unlock(&taprio_list_lock); + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)), + extack); + if (!qdisc) + return -ENOMEM; + + if (i < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + + q->qdiscs[i] = qdisc; + } + return taprio_change(sch, opt, extack); } @@ -781,7 +988,7 @@ static int dump_entry(struct sk_buff *msg, { struct nlattr *item; - item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY); + item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; @@ -806,15 +1013,55 @@ nla_put_failure: return -1; } +static int dump_schedule(struct sk_buff *msg, + const struct sched_gate_list *root) +{ + struct nlattr *entry_list; + struct sched_entry *entry; + + if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, + root->base_time, TCA_TAPRIO_PAD)) + return -1; + + if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, + root->cycle_time, TCA_TAPRIO_PAD)) + return -1; + + if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, + root->cycle_time_extension, TCA_TAPRIO_PAD)) + return -1; + + entry_list = nla_nest_start_noflag(msg, + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); + if (!entry_list) + goto error_nest; + + list_for_each_entry(entry, &root->entries, list) { + if (dump_entry(msg, entry) < 0) + goto error_nest; + } + + nla_nest_end(msg, entry_list); + return 0; + +error_nest: + nla_nest_cancel(msg, entry_list); + return -1; +} + static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; - struct nlattr *nest, *entry_list; - struct sched_entry *entry; + struct nlattr *nest, *sched_nest; unsigned int i; + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); + admin = rcu_dereference(q->admin_sched); + opt.num_tc = netdev_get_num_tc(dev); memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); @@ -823,36 +1070,45 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = dev->tc_to_txq[i].offset; } - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) - return -ENOSPC; + goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; - if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, - q->base_time, TCA_TAPRIO_PAD)) + if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; - if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) + if (oper && dump_schedule(skb, 
oper)) goto options_error; - entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); - if (!entry_list) + if (!admin) + goto done; + + sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); + if (!sched_nest) goto options_error; - list_for_each_entry(entry, &q->entries, list) { - if (dump_entry(skb, entry) < 0) - goto options_error; - } + if (dump_schedule(skb, admin)) + goto admin_error; + + nla_nest_end(skb, sched_nest); - nla_nest_end(skb, entry_list); +done: + rcu_read_unlock(); return nla_nest_end(skb, nest); +admin_error: + nla_nest_cancel(skb, sched_nest); + options_error: nla_nest_cancel(skb, nest); - return -1; + +start_error: + rcu_read_unlock(); + return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) @@ -895,7 +1151,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || - gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) + qdisc_qstats_copy(d, sch) < 0) return -1; return 0; } @@ -939,6 +1195,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, + .change = taprio_change, .destroy = taprio_destroy, .peek = taprio_peek, .dequeue = taprio_dequeue, @@ -947,14 +1204,24 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; +static struct notifier_block taprio_device_notifier = { + .notifier_call = taprio_dev_notifier, +}; + static int __init taprio_module_init(void) { + int err = register_netdevice_notifier(&taprio_device_notifier); + + if (err) + return err; + return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); + unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 7f272a9070c5..c09c0d855846 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -308,7 +308,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, s64 buffer, mtu; u64 rate64 = 0, prate64 = 0; - err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL); + err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy, + NULL); if (err < 0) return err; @@ -391,8 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { - qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, - q->qdisc->qstats.backlog); + qdisc_tree_flush_backlog(q->qdisc); qdisc_put(q->qdisc); q->qdisc = child; } @@ -449,7 +449,7 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_tbf_qopt opt; sch->qstats.backlog = q->qdisc->qstats.backlog; - nest = nla_nest_start(skb, TCA_OPTIONS); + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 39d72e58b8e5..31569f4809f6 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -760,7 +760,6 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = 0; crypto_shash_digest(desc, (u8 *)auth, end - (unsigned char *)auth, digest); shash_desc_zero(desc); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 6200cd2b4b99..188c47eb206e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1030,6 +1030,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .getname = sctp_getname, 
.poll = sctp_poll, .ioctl = inet6_ioctl, + .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6abc8b274270..f0631bf486b6 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -600,6 +600,7 @@ out: static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { /* No address mapping for V4 sockets */ + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } @@ -1025,6 +1026,7 @@ static const struct proto_ops inet_seqpacket_ops = { .getname = inet_getname, /* Semantics are different. */ .poll = sctp_poll, .ioctl = inet_ioctl, + .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d05c57664e36..72e74503f9fc 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1684,7 +1684,6 @@ static struct sctp_cookie_param *sctp_pack_cookie( /* Sign the message. */ desc->tfm = sctp_sk(ep->base.sk)->hmac; - desc->flags = 0; err = crypto_shash_setkey(desc->tfm, ep->secret_key, sizeof(ep->secret_key)) ?: @@ -1755,7 +1754,6 @@ struct sctp_association *sctp_unpack_cookie( int err; desc->tfm = sctp_sk(ep->base.sk)->hmac; - desc->flags = 0; err = crypto_shash_setkey(desc->tfm, ep->secret_key, sizeof(ep->secret_key)) ?: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 1d143bc3f73d..4aa03588f87b 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1112,32 +1112,6 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, } -/* Send the next ASCONF packet currently stored in the association. - * This happens after the ASCONF_ACK was successfully processed. - */ -static void sctp_cmd_send_asconf(struct sctp_association *asoc) -{ - struct net *net = sock_net(asoc->base.sk); - - /* Send the next asconf chunk from the addip chunk - * queue. - */ - if (!list_empty(&asoc->addip_chunk_list)) { - struct list_head *entry = asoc->addip_chunk_list.next; - struct sctp_chunk *asconf = list_entry(entry, - struct sctp_chunk, list); - list_del_init(entry); - - /* Hold the chunk until an ASCONF_ACK is received. */ - sctp_chunk_hold(asconf); - if (sctp_primitive_ASCONF(net, asoc, asconf)) - sctp_chunk_free(asconf); - else - asoc->addip_last_asconf = asconf; - } -} - - /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real * functionality there.
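For illustration: the sctp_cmd_send_asconf() removed above and its replacement, sctp_send_next_asconf() added to sm_statefuns.c below, enforce the same ADDIP rule, namely that at most one ASCONF chunk is in flight per association and the next queued chunk is released only once the previous ASCONF_ACK has been processed. A minimal standalone C sketch of that queue discipline (illustrative names only, not kernel API):

#include <stdio.h>
#include <stdlib.h>

struct chunk {
	int serial;
	struct chunk *next;
};

static struct chunk *queue_head, *queue_tail;	/* addip_chunk_list analogue */
static struct chunk *in_flight;			/* addip_last_asconf analogue */

static void enqueue(int serial)
{
	struct chunk *c = calloc(1, sizeof(*c));

	c->serial = serial;
	if (queue_tail)
		queue_tail->next = c;
	else
		queue_head = c;
	queue_tail = c;
}

static void send_next(void)
{
	/* only one ASCONF may be outstanding */
	if (in_flight || !queue_head)
		return;
	in_flight = queue_head;
	queue_head = queue_head->next;
	if (!queue_head)
		queue_tail = NULL;
	in_flight->next = NULL;
	printf("sent ASCONF serial %d\n", in_flight->serial);
}

static void ack_received(void)
{
	/* what SCTP_CMD_SEND_NEXT_ASCONF used to trigger */
	free(in_flight);
	in_flight = NULL;
	send_next();
}

int main(void)
{
	enqueue(1);
	enqueue(2);
	send_next();	/* sends serial 1 */
	send_next();	/* no-op: serial 1 is still unacked */
	ack_received();	/* sends serial 2 */
	ack_received();
	return 0;
}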
@@ -1783,9 +1757,6 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, } sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; - case SCTP_CMD_SEND_NEXT_ASCONF: - sctp_cmd_send_asconf(asoc); - break; case SCTP_CMD_PURGE_ASCONF_QUEUE: sctp_asconf_queue_teardown(asoc); break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c9ae3404b1bb..e3f4abe6134e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3824,6 +3824,29 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, return SCTP_DISPOSITION_CONSUME; } +static enum sctp_disposition sctp_send_next_asconf( + struct net *net, + const struct sctp_endpoint *ep, + struct sctp_association *asoc, + const union sctp_subtype type, + struct sctp_cmd_seq *commands) +{ + struct sctp_chunk *asconf; + struct list_head *entry; + + if (list_empty(&asoc->addip_chunk_list)) + return SCTP_DISPOSITION_CONSUME; + + entry = asoc->addip_chunk_list.next; + asconf = list_entry(entry, struct sctp_chunk, list); + + list_del_init(entry); + sctp_chunk_hold(asconf); + asoc->addip_last_asconf = asconf; + + return sctp_sf_do_prm_asconf(net, ep, asoc, type, asconf, commands); +} + /* * ADDIP Section 4.3 General rules for address manipulation * When building TLV parameters for the ASCONF Chunk that will add or @@ -3915,14 +3938,10 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); if (!sctp_process_asconf_ack((struct sctp_association *)asoc, - asconf_ack)) { - /* Successfully processed ASCONF_ACK. We can - * release the next asconf if we have one. - */ - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF, - SCTP_NULL()); - return SCTP_DISPOSITION_CONSUME; - } + asconf_ack)) + return sctp_send_next_asconf(net, ep, + (struct sctp_association *)asoc, + type, commands); abort = sctp_make_abort(asoc, asconf_ack, sizeof(struct sctp_errhdr)); @@ -6412,13 +6431,15 @@ static int sctp_eat_data(const struct sctp_association *asoc, * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our * memory usage too much */ - if (*sk->sk_prot_creator->memory_pressure) { + if (sk_under_memory_pressure(sk)) { if (sctp_tsnmap_has_gap(map) && (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { pr_debug("%s: under pressure, reneging for tsn:%u\n", __func__, tsn); deliver = SCTP_CMD_RENEGE; - } + } else { + sk_mem_reclaim(sk); + } } /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9874e60c9b0d..e4e892cc5644 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1913,7 +1913,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sctp_wspace(asoc) < (int)msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - if (sctp_wspace(asoc) <= 0) { + if (sk_under_memory_pressure(sk)) + sk_mem_reclaim(sk); + + if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) @@ -4847,7 +4850,8 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, } /* Validate addr_len before calling common connect/connectx routine. */ - af = sctp_get_af_specific(addr->sa_family); + af = addr_len < offsetofend(struct sockaddr, sa_family) ? 
NULL : + sctp_get_af_specific(addr->sa_family); if (!af || addr_len < af->sockaddr_len) { err = -EINVAL; } else { @@ -8930,7 +8934,10 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, goto do_error; if (signal_pending(current)) goto do_interrupted; - if ((int)msg_len <= sctp_wspace(asoc)) + if (sk_under_memory_pressure(sk)) + sk_mem_reclaim(sk); + if ((int)msg_len <= sctp_wspace(asoc) && + sk_wmem_schedule(sk, msg_len)) break; /* Let another process have a go. Since we are going diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 102c6fefe38c..25e0b7e5189c 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -484,14 +484,15 @@ static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *event) + struct sk_buff_head *skb_list) { - struct sk_buff *skb = sctp_event2skb(event); struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff_head *skb_list; + struct sctp_ulpevent *event; + struct sk_buff *skb; - skb_list = (struct sk_buff_head *)skb->prev; + skb = __skb_peek(skb_list); + event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || @@ -858,19 +859,24 @@ static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); - if (event && event->msg_flags & MSG_EOR) { + if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); - event = sctp_intl_order(ulpq, event); + if (event->msg_flags & MSG_EOR) + event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 
1 : 0; - sctp_enqueue_event(ulpq, event); + sctp_enqueue_event(ulpq, &temp); } return event_eor; @@ -944,20 +950,27 @@ out: static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; + struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); - if (event) - sctp_enqueue_event(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_enqueue_event(ulpq, &temp); + } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); - if (event) - sctp_enqueue_event(ulpq, event); + if (event) { + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_enqueue_event(ulpq, &temp); + } } while (event); } } @@ -1059,7 +1072,7 @@ static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) if (event) { sctp_intl_retrieve_ordered(ulpq, event); - sctp_enqueue_event(ulpq, event); + sctp_enqueue_event(ulpq, &temp); } } @@ -1298,6 +1311,15 @@ static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) ntohl(skip->mid), skip->flags); } +static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +{ + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + return sctp_ulpq_tail_event(ulpq, &temp); +} + static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), @@ -1306,7 +1328,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, - .enqueue_event = sctp_ulpq_tail_event, + .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, @@ -1317,6 +1339,16 @@ static struct sctp_stream_interleave sctp_stream_interleave_0 = { .handle_ftsn = sctp_handle_fwdtsn, }; +static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + return sctp_enqueue_event(ulpq, &temp); +} + static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), @@ -1325,7 +1357,7 @@ static struct sctp_stream_interleave sctp_stream_interleave_1 = { .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, - .enqueue_event = sctp_enqueue_event, + .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 8cb7d9858270..c2a7478587ab 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -634,8 +634,9 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, gfp_t gfp) { struct sctp_ulpevent *event = NULL; - struct sk_buff *skb; - size_t padding, len; + struct sk_buff *skb = chunk->skb; + struct sock *sk = asoc->base.sk; + size_t padding, datalen; int rx_count; /* @@ -646,15 +647,12 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (asoc->ep->rcvbuf_policy) rx_count = atomic_read(&asoc->rmem_alloc); else - 
rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + rx_count = atomic_read(&sk->sk_rmem_alloc); - if (rx_count >= asoc->base.sk->sk_rcvbuf) { + datalen = ntohs(chunk->chunk_hdr->length); - if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb, - chunk->skb->truesize))) - goto fail; - } + if (rx_count >= sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, datalen)) + goto fail; /* Clone the original skb, sharing the data. */ skb = skb_clone(chunk->skb, gfp); @@ -681,8 +679,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * The sender should never pad with more than 3 bytes. The receiver * MUST ignore the padding bytes. */ - len = ntohs(chunk->chunk_hdr->length); - padding = SCTP_PAD4(len) - len; + padding = SCTP_PAD4(datalen) - datalen; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 5dde92101743..a212fe079c07 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -116,12 +116,13 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, event = sctp_ulpq_reasm(ulpq, event); /* Do ordering if needed. */ - if ((event) && (event->msg_flags & MSG_EOR)) { + if (event) { /* Create a temporary list to collect chunks on. */ skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); - event = sctp_ulpq_order(ulpq, event); + if (event->msg_flags & MSG_EOR) + event = sctp_ulpq_order(ulpq, event); } /* Send event to the ULP. 'event' is the sctp_ulpevent for @@ -129,7 +130,7 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, */ if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } return event_eor; @@ -193,18 +194,17 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); } -/* If the SKB of 'event' is on a list, it is the first such member - * of that list. - */ -int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); - struct sk_buff_head *queue, *skb_list; - struct sk_buff *skb = sctp_event2skb(event); + struct sctp_ulpevent *event; + struct sk_buff_head *queue; + struct sk_buff *skb; int clear_pd = 0; - skb_list = (struct sk_buff_head *) skb->prev; + skb = __skb_peek(skb_list); + event = sctp_skb2event(skb); /* If the socket is just going to throw this away, do not * even try to deliver it. @@ -257,13 +257,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) } } - /* If we are harvesting multiple skbs they will be - * collected on a list. - */ - if (skb_list) - skb_queue_splice_tail_init(skb_list, queue); - else - __skb_queue_tail(queue, skb); + skb_queue_splice_tail_init(skb_list, queue); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive @@ -738,25 +732,25 @@ void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn) static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) { struct sctp_ulpevent *event = NULL; - struct sk_buff_head temp; if (skb_queue_empty(&ulpq->reasm)) return; while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { - /* Do ordering if needed. 
*/ - if ((event) && (event->msg_flags & MSG_EOR)) { - skb_queue_head_init(&temp); - __skb_queue_tail(&temp, sctp_event2skb(event)); + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + /* Do ordering if needed. */ + if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); - } /* Send event to the ULP. 'event' is the * sctp_ulpevent for very first SKB on the temp' list. */ if (event) - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } } @@ -956,7 +950,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) if (event) { /* see if we have more ordered that we can deliver */ sctp_ulpq_retrieve_ordered(ulpq, event); - sctp_ulpq_tail_event(ulpq, event); + sctp_ulpq_tail_event(ulpq, &temp); } } @@ -1082,7 +1076,11 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, event = sctp_ulpq_retrieve_first(ulpq); /* Send event to the ULP. */ if (event) { - sctp_ulpq_tail_event(ulpq, event); + struct sk_buff_head temp; + + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + sctp_ulpq_tail_event(ulpq, &temp); sctp_ulpq_set_pd(ulpq); return; } @@ -1106,7 +1104,8 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, freed += sctp_ulpq_renege_frags(ulpq, needed - freed); } /* If able to free enough room, accept this chunk. */ - if (freed >= needed) { + if (sk_rmem_schedule(asoc->base.sk, chunk->skb, needed) && + freed >= needed) { int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 77ef53596d18..086d9913975d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -134,11 +134,9 @@ static int smc_release(struct socket *sock) smc = smc_sk(sk); /* cleanup for a dangling non-blocking connect */ - if (smc->connect_info && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && sk->sk_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); flush_work(&smc->connect_work); - kfree(smc->connect_info); - smc->connect_info = NULL; if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires @@ -167,10 +165,9 @@ static int smc_release(struct socket *sock) if (sk->sk_state == SMC_CLOSED) { if (smc->clcsock) { - mutex_lock(&smc->clcsock_release_lock); - sock_release(smc->clcsock); - smc->clcsock = NULL; - mutex_unlock(&smc->clcsock_release_lock); + release_sock(sk); + smc_clcsock_release(smc); + lock_sock(sk); } if (!smc->use_fallback) smc_conn_free(&smc->conn); @@ -446,12 +443,22 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } +static void smc_switch_to_fallback(struct smc_sock *smc) +{ + smc->use_fallback = true; + if (smc->sk.sk_socket && smc->sk.sk_socket->file) { + smc->clcsock->file = smc->sk.sk_socket->file; + smc->clcsock->file->private_data = smc->clcsock; + } +} + /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = reason_code; smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; @@ -491,46 +498,41 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, mutex_unlock(&smc_client_lgr_pending); smc_conn_free(&smc->conn); + smc->connect_nonblock = 0; return reason_code; } /* check if there is a rdma device available for this connection. 
*/ /* called for connect and listen */ -static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) +static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { - int reason_code = 0; - /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id, - gid); - if (!(*ibdev)) - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - - return reason_code; + smc_pnet_find_roce_resource(smc->clcsock->sk, ini); + if (!ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + return 0; } /* check if there is an ISM device available for this connection. */ /* called for connect and listen */ -static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ - smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); - if (!(*ismdev)) - return SMC_CLC_DECL_CNFERR; /* configuration error */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ini); + if (!ini->ism_dev) + return SMC_CLC_DECL_NOSMCDDEV; return 0; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { - if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) - return SMC_CLC_DECL_CNFERR; + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) + return SMC_CLC_DECL_ISMVLANERR; return 0; } @@ -538,12 +540,11 @@ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, * used, the VLAN ID will be registered again during the connection setup. 
*/ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, - struct smcd_dev *ismdev, - unsigned short vlan_id) + struct smc_init_info *ini) { if (!is_smcd) return 0; - if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } @@ -551,13 +552,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport, - u8 gid[], struct smcd_dev *ismdev) + struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev); + rc = smc_clc_send_proposal(smc, smc_type, ini); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -568,23 +568,19 @@ static int smc_connect_clc(struct smc_sock *smc, int smc_type, /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; struct smc_link *link; int reason_code = 0; + ini->is_smcd = false; + ini->ib_lcl = &aclc->lcl; + ini->ib_clcqpn = ntoh24(aclc->qpn); + ini->srv_first_contact = aclc->hdr.flag; + mutex_lock(&smc_client_lgr_pending); - local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, - ibport, ntoh24(aclc->qpn), &aclc->lcl, - NULL, 0); - if (local_contact < 0) { - if (local_contact == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (local_contact == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. 
error */ - else - reason_code = SMC_CLC_DECL_INTERR; /* other error */ + reason_code = smc_conn_create(smc, ini); + if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -594,45 +590,48 @@ static int smc_connect_rdma(struct smc_sock *smc, /* create send buffer and rmb */ if (smc_buf_create(smc, false)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); - if (local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); if (smc_rmb_rtoken_handling(&smc->conn, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - local_contact); + ini->cln_first_contact); smc_close_init(smc); smc_rx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { if (smc_ib_ready_link(link)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - local_contact); + ini->cln_first_contact); } else { if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - local_contact); + ini->cln_first_contact); } smc_rmb_sync_sg_for_device(&smc->conn); reason_code = smc_clc_send_confirm(smc); if (reason_code) - return smc_connect_abort(smc, reason_code, local_contact); + return smc_connect_abort(smc, reason_code, + ini->cln_first_contact); smc_tx_init(smc); - if (local_contact == SMC_FIRST_CONTACT) { + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, - local_contact); + ini->cln_first_contact); } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -642,23 +641,26 @@ static int smc_connect_rdma(struct smc_sock *smc, /* setup for ISM connection of client */ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, - struct smcd_dev *ismdev) + struct smc_init_info *ini) { - int local_contact = SMC_FIRST_CONTACT; int rc = 0; + ini->is_smcd = true; + ini->ism_gid = aclc->gid; + ini->srv_first_contact = aclc->hdr.flag; + /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); - local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0, - NULL, ismdev, aclc->gid); - if (local_contact < 0) { + rc = smc_conn_create(smc, ini); + if (rc) { mutex_unlock(&smc_server_lgr_pending); - return SMC_CLC_DECL_MEM; + return rc; } /* Create send and receive buffers */ if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, + ini->cln_first_contact); smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); @@ -667,10 +669,11 @@ static int smc_connect_ism(struct smc_sock *smc, rc = smc_clc_send_confirm(smc); if (rc) - return smc_connect_abort(smc, rc, local_contact); + return smc_connect_abort(smc, rc, ini->cln_first_contact); mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); + smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; @@ -682,13 +685,9 @@ static int __smc_connect(struct smc_sock *smc) { bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; - struct smc_ib_device *ibdev; - struct smcd_dev *ismdev; - u8 gid[SMC_GID_SIZE]; - 
unsigned short vlan; + struct smc_init_info ini = {0}; int smc_type; int rc = 0; - u8 ibport; sock_hold(&smc->sk); /* sock put in passive closing */ @@ -703,20 +702,21 @@ static int __smc_connect(struct smc_sock *smc) if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check for VLAN ID */ - if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, &ini)) + return smc_connect_decline_fallback(smc, + SMC_CLC_DECL_GETVLANERR); /* check if there is an ism device available */ - if (!smc_check_ism(smc, &ismdev) && - !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + if (!smc_find_ism_device(smc, &ini) && + !smc_connect_ism_vlan_setup(smc, &ini)) { /* ISM is supported for this connection */ ism_supported = true; smc_type = SMC_TYPE_D; } /* check if there is a rdma device available */ - if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) { + if (!smc_find_rdma_device(smc, &ini)) { /* RDMA is supported for this connection */ rdma_supported = true; if (ism_supported) @@ -730,25 +730,25 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev); + rc = smc_connect_clc(smc, smc_type, &aclc, &ini); if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } /* depending on previous steps, connect using rdma or ism */ if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + rc = smc_connect_rdma(smc, &aclc, &ini); else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) - rc = smc_connect_ism(smc, &aclc, ismdev); + rc = smc_connect_ism(smc, &aclc, &ini); else rc = SMC_CLC_DECL_MODEUNSUPP; if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return smc_connect_decline_fallback(smc, rc); } - smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); + smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); return 0; } @@ -756,17 +756,30 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); - int rc; + long timeo = smc->sk.sk_sndtimeo; + int rc = 0; - lock_sock(&smc->sk); - rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, - smc->connect_info->alen, smc->connect_info->flags); + if (!timeo) + timeo = MAX_SCHEDULE_TIMEOUT; + lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; - goto out; - } - if (rc < 0) { - smc->sk.sk_err = -rc; + } else if ((1 << smc->clcsock->sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); + if ((rc == -EPIPE) && + ((1 << smc->clcsock->sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) + rc = 0; + } + release_sock(smc->clcsock->sk); + lock_sock(&smc->sk); + if (rc != 0 || smc->sk.sk_err) { + smc->sk.sk_state = SMC_CLOSED; + if (rc == -EPIPE || rc == -EAGAIN) + smc->sk.sk_err = EPIPE; + else if (signal_pending(current)) + smc->sk.sk_err = -sock_intr_errno(timeo); goto out; } @@ -775,12 +788,14 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - if (smc->sk.sk_err) -
smc->sk.sk_state_change(&smc->sk); - else - smc->sk.sk_write_space(&smc->sk); - kfree(smc->connect_info); - smc->connect_info = NULL; + if (!sock_flag(&smc->sk, SOCK_DEAD)) { + if (smc->sk.sk_err) { + smc->sk.sk_state_change(&smc->sk); + } else { /* allow polling before and after fallback decision */ + smc->clcsock->sk->sk_write_space(smc->clcsock->sk); + smc->sk.sk_write_space(&smc->sk); + } + } release_sock(&smc->sk); } @@ -813,26 +828,18 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (smc->connect_nonblock) { + rc = -EALREADY; + goto out; + } + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc && rc != -EINPROGRESS) + goto out; if (flags & O_NONBLOCK) { - if (smc->connect_info) { - rc = -EALREADY; - goto out; - } - smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL); - if (!smc->connect_info) { - rc = -ENOMEM; - goto out; - } - smc->connect_info->alen = alen; - smc->connect_info->flags = flags ^ O_NONBLOCK; - memcpy(&smc->connect_info->addr, addr, alen); - schedule_work(&smc->connect_work); + if (schedule_work(&smc->connect_work)) + smc->connect_nonblock = 1; rc = -EINPROGRESS; } else { - rc = kernel_connect(smc->clcsock, addr, alen, flags); - if (rc) - goto out; - rc = __smc_connect(smc); if (rc < 0) goto out; @@ -872,11 +879,11 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (rc < 0) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; sock_set_flag(new_sk, SOCK_DEAD); - new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; @@ -927,16 +934,21 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { + new_sk->sk_prot->unhash(new_sk); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } - new_sk->sk_prot->unhash(new_sk); sock_put(new_sk); /* final */ continue; } - if (new_sock) + if (new_sock) { sock_graft(new_sk, new_sock); + if (isk->use_fallback) { + smc_sk(new_sk)->clcsock->file = new_sock->file; + isk->clcsock->file->private_data = isk->clcsock; + } + } return new_sk; } return NULL; @@ -956,6 +968,7 @@ void smc_close_non_accepted(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } + sk->sk_prot->unhash(sk); if (smc->clcsock) { struct socket *tcp; @@ -971,7 +984,6 @@ void smc_close_non_accepted(struct sock *sk) smc_conn_free(&smc->conn); } release_sock(sk); - sk->sk_prot->unhash(sk); sock_put(sk); /* final sock_put */ } @@ -1037,13 +1049,13 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; - lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); if (lsmc->sk.sk_state == SMC_LISTEN) { + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); + release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } - release_sock(&lsmc->sk); /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); @@ -1087,7 +1099,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, return; } smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if 
(smc_clc_send_decline(new_smc, reason_code) < 0) { @@ -1099,7 +1111,7 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, } /* listen worker: check prefixes */ -static int smc_listen_rdma_check(struct smc_sock *new_smc, +static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; @@ -1107,25 +1119,21 @@ static int smc_listen_rdma_check(struct smc_sock *new_smc, pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, - struct smc_ib_device *ibdev, u8 ibport, - int *local_contact) + struct smc_init_info *ini) { + int rc; + /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0, - &pclc->lcl, NULL, 0); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) @@ -1137,33 +1145,30 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, - struct smcd_dev *ismdev, - int *local_contact) + struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd; + int rc; pclc_smcd = smc_get_clc_msg_smcd(pclc); - *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL, - ismdev, pclc_smcd->gid); - if (*local_contact < 0) { - if (*local_contact == -ENOMEM) - return SMC_CLC_DECL_MEM;/* insufficient memory*/ - return SMC_CLC_DECL_INTERR; /* other error */ - } + ini->ism_gid = pclc_smcd->gid; + rc = smc_conn_create(new_smc, ini); + if (rc) + return rc; /* Check if peer can be reached via ISM device */ if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, new_smc->conn.lgr->vlan_id, new_smc->conn.lgr->smcd)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_CNFERR; + return SMC_CLC_DECL_SMCDNOTALK; } /* Create send and receive buffers */ if (smc_buf_create(new_smc, true)) { - if (*local_contact == SMC_FIRST_CONTACT) + if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_lgr_forget(new_smc->conn.lgr); smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_MEM; @@ -1227,15 +1232,13 @@ static void smc_listen_work(struct work_struct *work) struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *ibdev; + struct smc_init_info ini = {0}; bool ism_supported = false; - struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; - int local_contact = 0; - unsigned short vlan; - int reason_code = 0; int rc = 0; - u8 ibport; + + if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) + return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); @@ -1244,7 +1247,7 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; + smc_switch_to_fallback(new_smc); 
new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; smc_listen_out_connected(new_smc); return; @@ -1254,17 +1257,26 @@ static void smc_listen_work(struct work_struct *work) * wait for and receive SMC Proposal CLC message */ pclc = (struct smc_clc_msg_proposal *)&buf; - reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, - SMC_CLC_PROPOSAL, CLC_WAIT_TIME); - if (reason_code) { - smc_listen_decline(new_smc, reason_code, 0); - return; - } + rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL, CLC_WAIT_TIME); + if (rc) + goto out_decl; /* IPSec connections opt out of SMC-R optimizations */ if (using_ipsec(new_smc)) { - smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); - return; + rc = SMC_CLC_DECL_IPSEC; + goto out_decl; + } + + /* check for matching IP prefix and subnet length */ + rc = smc_listen_prfx_check(new_smc, pclc); + if (rc) + goto out_decl; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) { + rc = SMC_CLC_DECL_GETVLANERR; + goto out_decl; } mutex_lock(&smc_server_lgr_pending); @@ -1273,59 +1285,73 @@ static void smc_listen_work(struct work_struct *work) smc_tx_init(new_smc); /* check if ISM is available */ - if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && - !smc_check_ism(new_smc, &ismdev) && - !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { - ism_supported = true; + if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) { + ini.is_smcd = true; /* prepare ISM check */ + rc = smc_find_ism_device(new_smc, &ini); + if (!rc) + rc = smc_listen_ism_init(new_smc, pclc, &ini); + if (!rc) + ism_supported = true; + else if (pclc->hdr.path == SMC_TYPE_D) + goto out_unlock; /* skip RDMA and decline */ } /* check if RDMA is available */ - if (!ism_supported && - ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || - smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) || - smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact))) { - /* SMC not supported, decline */ - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP, - local_contact); - return; + if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ + /* prepare RDMA check */ + memset(&ini, 0, sizeof(ini)); + ini.is_smcd = false; + ini.ib_lcl = &pclc->lcl; + rc = smc_find_rdma_device(new_smc, &ini); + if (rc) { + /* no RDMA device found */ + if (pclc->hdr.path == SMC_TYPE_B) + /* neither ISM nor RDMA device found */ + rc = SMC_CLC_DECL_NOSMCDEV; + goto out_unlock; + } + rc = smc_listen_rdma_init(new_smc, &ini); + if (rc) + goto out_unlock; + rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; } /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) { - mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, rc, local_contact); - return; - } + rc = smc_clc_send_accept(new_smc, ini.cln_first_contact); + if (rc) + goto out_unlock; /* SMC-D does not need this lock any more */ if (ism_supported) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM, CLC_WAIT_TIME); - if (reason_code) { + rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM, CLC_WAIT_TIME); + if (rc) { if (!ism_supported) - 
mutex_unlock(&smc_server_lgr_pending); - smc_listen_decline(new_smc, reason_code, local_contact); - return; + goto out_unlock; + goto out_decl; } /* finish worker */ if (!ism_supported) { - rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact); + rc = smc_listen_rdma_finish(new_smc, &cclc, + ini.cln_first_contact); mutex_unlock(&smc_server_lgr_pending); if (rc) return; } smc_conn_save_peer_info(new_smc, &cclc); smc_listen_out_connected(new_smc); + return; + +out_unlock: + mutex_unlock(&smc_server_lgr_pending); +out_decl: + smc_listen_decline(new_smc, rc, ini.cln_first_contact); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1501,7 +1527,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { rc = -EINVAL; @@ -1571,8 +1597,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - __poll_t mask = 0; struct smc_sock *smc; + __poll_t mask = 0; if (!sk) return EPOLLNVAL; @@ -1582,8 +1608,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) - mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); @@ -1594,9 +1618,14 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ - mask = smc_accept_poll(sk); + mask |= smc_accept_poll(sk); + } else if (smc->use_fallback) { /* as result of connect_work()*/ + mask |= smc->clcsock->ops->poll(file, smc->clcsock, + wait); + sk->sk_err = smc->clcsock->sk->sk_err; } else { - if (atomic_read(&smc->conn.sndbuf_space) || + if ((sk->sk_state != SMC_INIT && + atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { @@ -1703,7 +1732,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT) { - smc->use_fallback = true; + smc_switch_to_fallback(smc); smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; } else { if (!smc->use_fallback) diff --git a/net/smc/smc.h b/net/smc/smc.h index adbdf195eb08..878313f8d6c1 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -190,18 +190,11 @@ struct smc_connection { u64 peer_token; /* SMC-D token of peer */ }; -struct smc_connect_info { - int flags; - int alen; - struct sockaddr addr; -}; - struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ - struct smc_connect_info *connect_info; /* connect address & flags */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ @@ -219,6 +212,10 @@ struct smc_sock { /* smc sock container */ * started, waiting for unsent * data to be sent */ + u8 connect_nonblock : 1; + /* non-blocking connect in + * flight + */ struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 
d53fd588d1f5..745afd82f281 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -385,8 +385,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) /* send CLC PROPOSAL message across internal TCP socket */ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *ibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev) + struct smc_init_info *ini) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; @@ -416,8 +415,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, /* add SMC-R specifics */ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + ETH_ALEN); pclc.iparea_offset = htons(0); } if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { @@ -425,7 +425,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, memset(&pclc_smcd, 0, sizeof(pclc_smcd)); plen += sizeof(pclc_smcd); pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); - pclc_smcd.gid = ismdev->local_gid; + pclc_smcd.gid = ini->ism_dev->local_gid; } pclc.hdr.length = htons(plen); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 24658e8c0de4..ca209272e5fa 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -34,16 +34,22 @@ #define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ #define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */ #define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */ -#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */ +#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ +#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ +#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ +#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ +#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ +#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ +#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ -#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ -#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */ -#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */ +#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ +#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ +#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ +#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -179,6 +185,7 @@ smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) } struct smcd_dev; +struct smc_init_info; int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); @@ -186,8 +193,7 @@ int smc_clc_wait_msg(struct smc_sock 
*smc, void *buf, int buflen, u8 expected_type, unsigned long timeout); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_ib_device *smcibdev, u8 ibport, u8 gid[], - struct smcd_dev *ismdev); + struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 2ad37e998509..fc06720b53c1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -21,6 +21,22 @@ #define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ) +/* release the clcsock that is assigned to the smc_sock */ +void smc_clcsock_release(struct smc_sock *smc) +{ + struct socket *tcp; + + if (smc->listen_smc && current_work() != &smc->smc_listen_work) + cancel_work_sync(&smc->smc_listen_work); + mutex_lock(&smc->clcsock_release_lock); + if (smc->clcsock) { + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + mutex_unlock(&smc->clcsock_release_lock); +} + static void smc_close_cleanup_listen(struct sock *parent) { struct sock *sk; @@ -321,6 +337,7 @@ static void smc_close_passive_work(struct work_struct *work) close_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_cdc_conn_state_flags *rxflags; + bool release_clcsock = false; struct sock *sk = &smc->sk; int old_state; @@ -400,13 +417,13 @@ wakeup: if ((sk->sk_state == SMC_CLOSED) && (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { smc_conn_free(conn); - if (smc->clcsock) { - sock_release(smc->clcsock); - smc->clcsock = NULL; - } + if (smc->clcsock) + release_clcsock = true; } } release_sock(sk); + if (release_clcsock) + smc_clcsock_release(smc); sock_put(sk); /* sock_hold done by schedulers of close_work */ } diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index 19eb6a211c23..e0e3b5df25d2 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -23,5 +23,6 @@ void smc_close_wake_tx_prepared(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); int smc_close_shutdown_write(struct smc_sock *smc); void smc_close_init(struct smc_sock *smc); +void smc_clcsock_release(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 53a17cfa61af..2d2850adc2a3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -195,10 +195,7 @@ static void smc_lgr_free_work(struct work_struct *work) } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, - struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id, - struct smcd_dev *smcismdev, u64 peer_gid) +static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -206,20 +203,21 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, int rc = 0; int i; - if (is_smcd && vlan_id) { - rc = smc_ism_get_vlan(smcismdev, vlan_id); - if (rc) + if (ini->is_smcd && ini->vlan_id) { + if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) { + rc = SMC_CLC_DECL_ISMVLANERR; goto out; + } } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { - rc = -ENOMEM; + rc = SMC_CLC_DECL_MEM; goto out; } - lgr->is_smcd = is_smcd; + lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; - lgr->vlan_id = vlan_id; + lgr->vlan_id = ini->vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); @@ -231,29 +229,32 @@ static int 
smc_lgr_create(struct smc_sock *smc, bool is_smcd, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - if (is_smcd) { + if (ini->is_smcd) { /* SMC-D specific settings */ - lgr->peer_gid = peer_gid; - lgr->smcd = smcismdev; + lgr->peer_gid = ini->ism_gid; + lgr->smcd = ini->ism_dev; } else { /* SMC-R specific settings */ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = + ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) + smc_ib_setup_per_ibdev(ini->ib_dev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - vlan_id, lnk->gid, &lnk->sgid_index); + ini->vlan_id, lnk->gid, + &lnk->sgid_index); if (rc) goto free_lgr; rc = smc_llc_link_init(lnk); @@ -289,6 +290,12 @@ clear_llc_lnk: free_lgr: kfree(lgr); out: + if (rc < 0) { + if (rc == -ENOMEM) + rc = SMC_CLC_DECL_MEM; + else + rc = SMC_CLC_DECL_INTERR; + } return rc; } @@ -528,13 +535,13 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; int i, nest_lvl, rc = 0; - *vlan_id = 0; + ini->vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; @@ -546,7 +553,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) ndev = dst->dev; if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } @@ -560,7 +567,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) lower = lower->next; ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); if (is_vlan_dev(ndev)) { - *vlan_id = vlan_dev_vlan_id(ndev); + ini->vlan_id = vlan_dev_vlan_id(ndev); break; } } @@ -594,24 +601,16 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid) +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; - int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; - unsigned short vlan_id; enum smc_lgr_role role; int rc = 0; + ini->cln_first_contact = SMC_FIRST_CONTACT; role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); - if (rc) - return rc; - - if ((role == SMC_CLNT) && srv_first_contact) + if (role == SMC_CLNT && ini->srv_first_contact) /* create new link group as well */ goto create; @@ -619,14 +618,15 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : - smcr_lgr_match(lgr, lcl, role, clcqpn)) && + if ((ini->is_smcd ? + smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : + smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && !lgr->sync_err && - lgr->vlan_id == vlan_id && + lgr->vlan_id == ini->vlan_id && (role == SMC_CLNT || lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ - local_contact = SMC_REUSE_CONTACT; + ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ if (delayed_work_pending(&lgr->free_work)) @@ -638,19 +638,18 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, } spin_unlock_bh(&smc_lgr_list.lock); - if (role == SMC_CLNT && !srv_first_contact && - (local_contact == SMC_FIRST_CONTACT)) { + if (role == SMC_CLNT && !ini->srv_first_contact && + ini->cln_first_contact == SMC_FIRST_CONTACT) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error */ - return -ENOLINK; + return SMC_CLC_DECL_SYNCERR; } create: - if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, - lcl->id_for_peer, vlan_id, smcd, peer_gid); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + rc = smc_lgr_create(smc, ini); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ @@ -658,7 +657,7 @@ create: conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; - if (is_smcd) { + if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } @@ -667,7 +666,7 @@ create: #endif out: - return rc ? rc : local_contact; + return rc; } /* convert the RMB size into the compressed notation - minimum 16K. diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8806d2afa6ed..c00ac61dc129 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -229,6 +229,24 @@ struct smc_link_group { }; }; +struct smc_clc_msg_local; + +struct smc_init_info { + u8 is_smcd; + unsigned short vlan_id; + int srv_first_contact; + int cln_first_contact; + /* SMC-R */ + struct smc_clc_msg_local *ib_lcl; + struct smc_ib_device *ib_dev; + u8 ib_gid[SMC_GID_SIZE]; + u8 ib_port; + u32 ib_clcqpn; + /* SMC-D */ + u64 ism_gid; + struct smcd_dev *ism_dev; +}; + /* Find the connection associated with the given alert token in the link group. * To use rbtrees we have to implement our own search core. 
* Requires @conns_lock @@ -281,13 +299,10 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); -int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); +int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, - struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn, - struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, - u64 peer_gid); +int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); void smc_core_exit(void); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 53f429c04843..d14ca4af6f94 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -146,18 +146,13 @@ out: static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport) { const struct ib_gid_attr *attr; - int rc = 0; + int rc; attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0); if (IS_ERR(attr)) return -ENODEV; - if (attr->ndev) - memcpy(smcibdev->mac[ibport - 1], attr->ndev->dev_addr, - ETH_ALEN); - else - rc = -ENODEV; - + rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]); rdma_put_gid_attr(attr); return rc; } @@ -185,6 +180,7 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, unsigned short vlan_id, u8 gid[], u8 *sgid_index) { const struct ib_gid_attr *attr; + const struct net_device *ndev; int i; for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { @@ -192,11 +188,14 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, if (IS_ERR(attr)) continue; - if (attr->ndev && + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(attr); + if (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(attr->ndev)) || (vlan_id && is_vlan_dev(attr->ndev) && vlan_dev_vlan_id(attr->ndev) == vlan_id)) && attr->gid_type == IB_GID_TYPE_ROCE) { + rcu_read_unlock(); if (gid) memcpy(gid, &attr->gid, SMC_GID_SIZE); if (sgid_index) @@ -204,6 +203,7 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rdma_put_gid_attr(attr); return 0; } + rcu_read_unlock(); rdma_put_gid_attr(attr); } return -ENODEV; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 2fff79db1a59..e89e918b88e0 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -289,6 +289,11 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->vlan); smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); + if (!smcd->event_wq) { + kfree(smcd->conn); + kfree(smcd); + return NULL; + } return smcd; } EXPORT_SYMBOL_GPL(smcd_alloc_dev); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 8d2f6296279c..bab2da8cf17a 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -26,6 +26,7 @@ #include "smc_pnet.h" #include "smc_ib.h" #include "smc_ism.h" +#include "smc_core.h" #define SMC_ASCII_BLANK 32 @@ -603,35 +604,36 @@ static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - return smc_pnet_remove_by_pnetid(net, NULL); + smc_pnet_remove_by_pnetid(net, NULL); + return 0; } /* SMC_PNETID generic netlink operation definition */ static const struct genl_ops smc_pnet_ops[] = { { .cmd = 
SMC_PNETID_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_get, .dumpit = smc_pnet_dump, .start = smc_pnet_dump_start }, { .cmd = SMC_PNETID_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_add }, { .cmd = SMC_PNETID_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_del }, { .cmd = SMC_PNETID_FLUSH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = smc_pnet_policy, .doit = smc_pnet_flush } }; @@ -642,6 +644,7 @@ static struct genl_family smc_pnet_nl_family __ro_after_init = { .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, .maxattr = SMC_PNETID_MAX, + .policy = smc_pnet_policy, .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, @@ -758,8 +761,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, u8 gid[]) + struct smc_init_info *ini) { struct smc_ib_device *ibdev; @@ -779,10 +781,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; break; } } @@ -797,9 +799,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, * If nothing found, try to use handshake device */ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, - struct smc_ib_device **smcibdev, - u8 *ibport, unsigned short vlan_id, - u8 gid[]) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smc_ib_device *ibdev; @@ -809,7 +809,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } @@ -820,10 +820,10 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && smc_ib_port_active(ibdev, i) && - !smc_ib_determine_gid(ibdev, i, vlan_id, gid, - NULL)) { - *smcibdev = ibdev; - *ibport = i; + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; goto out; } } @@ -833,7 +833,7 @@ out: } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, - struct smcd_dev **smcismdev) + struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct smcd_dev *ismdev; @@ -847,7 +847,7 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, spin_lock(&smcd_dev_list.lock); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) { - *smcismdev = ismdev; + ini->ism_dev = ismdev; break; } } @@ -858,21 +858,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, * determine ib_device and port belonging to used internal TCP socket * ethernet interface. 
*/ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]) +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcibdev = NULL; - *ibport = 0; - + ini->ib_dev = NULL; + ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid); + smc_pnet_find_roce_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); @@ -880,17 +877,17 @@ out: return; } -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - *smcismdev = NULL; + ini->ism_dev = NULL; if (!dst) goto out; if (!dst->dev) goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + smc_pnet_find_ism_by_pnetid(dst->dev, ini); out_rel: dst_release(dst); diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5eac42fb45d0..4564e4d69c2e 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -18,6 +18,7 @@ struct smc_ib_device; struct smcd_dev; +struct smc_init_info; /** * struct smc_pnettable - SMC PNET table anchor @@ -43,9 +44,7 @@ int smc_pnet_init(void) __init; int smc_pnet_net_init(struct net *net); void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport, - unsigned short vlan_id, u8 gid[]); -void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); +void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); +void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); #endif diff --git a/net/socket.c b/net/socket.c index 8255f5bda0aa..472fbefa5d9b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -90,6 +90,7 @@ #include <linux/slab.h> #include <linux/xattr.h> #include <linux/nospec.h> +#include <linux/indirect_call_wrapper.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -108,6 +109,13 @@ #include <net/busy_poll.h> #include <linux/errqueue.h> +/* proto_ops for ipv4 and ipv6 use the same {recv,send}msg function */ +#if IS_ENABLED(CONFIG_INET) +#define INDIRECT_CALL_INET4(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_INET4(f, f1, ...) f(__VA_ARGS__) +#endif + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; @@ -645,10 +653,12 @@ EXPORT_SYMBOL(__sock_tx_timestamp); * Sends @msg through @sock, passing through LSM. * Returns the number of bytes sent, or an error code. */ - +INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *, + size_t)); static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); + int ret = INDIRECT_CALL_INET4(sock->ops->sendmsg, inet_sendmsg, sock, + msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); return ret; } @@ -874,11 +884,13 @@ EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); * Receives @msg from @sock, passing through LSM. Returns the total number * of bytes received, or an error. 
*/ - +INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *, + size_t, int)); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags); + return INDIRECT_CALL_INET4(sock->ops->recvmsg, inet_recvmsg, sock, msg, + msg_data_left(msg), flags); } int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) @@ -1164,6 +1176,26 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = open_related_ns(&net->ns, get_net_ns); break; + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: + if (!sock->ops->gettstamp) { + err = -ENOIOCTLCMD; + break; + } + err = sock->ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); + break; + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: + if (!sock->ops->gettstamp) { + err = -ENOIOCTLCMD; + break; + } + err = sock->ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); + break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; @@ -2916,38 +2948,6 @@ void socket_seq_show(struct seq_file *seq) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_COMPAT -static int do_siocgstamp(struct net *net, struct socket *sock, - unsigned int cmd, void __user *up) -{ - mm_segment_t old_fs = get_fs(); - struct timeval ktv; - int err; - - set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); - set_fs(old_fs); - if (!err) - err = compat_put_timeval(&ktv, up); - - return err; -} - -static int do_siocgstampns(struct net *net, struct socket *sock, - unsigned int cmd, void __user *up) -{ - mm_segment_t old_fs = get_fs(); - struct timespec kts; - int err; - - set_fs(KERNEL_DS); - err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); - set_fs(old_fs); - if (!err) - err = compat_put_timespec(&kts, up); - - return err; -} - static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) { struct compat_ifconf ifc32; @@ -3347,10 +3347,13 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDRT: case SIOCDELRT: return routing_ioctl(net, sock, cmd, argp); - case SIOCGSTAMP: - return do_siocgstamp(net, sock, cmd, argp); - case SIOCGSTAMPNS: - return do_siocgstampns(net, sock, cmd, argp); + case SIOCGSTAMP_OLD: + case SIOCGSTAMPNS_OLD: + if (!sock->ops->gettstamp) + return -ENOIOCTLCMD; + return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, + !COMPAT_USE_64BIT_TIME); + case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: @@ -3368,6 +3371,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCADDDLCI: case SIOCDELDLCI: case SIOCGSKNS: + case SIOCGSTAMP_NEW: + case SIOCGSTAMPNS_NEW: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 860dcfb95ee4..e137698e8aef 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -14,7 +14,8 @@ #include <linux/file.h> #include <linux/in.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/poll.h> @@ -140,13 +141,11 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* We are going to append to the frags_list of head. * Need to unshare the frag_list. 
*/ - if (skb_has_frag_list(head)) { - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - return 0; - } + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + return 0; } if (unlikely(skb_shinfo(head)->frag_list)) { @@ -299,7 +298,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, break; } - /* Positive extra indicates ore bytes than needed for the + /* Positive extra indicates more bytes than needed for the * message */ @@ -547,7 +546,7 @@ void strp_check_rcv(struct strparser *strp) } EXPORT_SYMBOL_GPL(strp_check_rcv); -static int __init strp_mod_init(void) +static int __init strp_dev_init(void) { strp_wq = create_singlethread_workqueue("kstrp"); if (unlikely(!strp_wq)) @@ -555,11 +554,4 @@ static int __init strp_mod_init(void) return 0; } - -static void __exit strp_mod_exit(void) -{ - destroy_workqueue(strp_wq); -} -module_init(strp_mod_init); -module_exit(strp_mod_exit); -MODULE_LICENSE("GPL"); +device_initcall(strp_dev_init); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 3fd56c0c90ae..4ce42c62458e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -269,6 +269,7 @@ err: struct gss_upcall_msg { refcount_t count; kuid_t uid; + const char *service_name; struct rpc_pipe_msg msg; struct list_head list; struct gss_auth *auth; @@ -316,6 +317,7 @@ gss_release_msg(struct gss_upcall_msg *gss_msg) gss_put_ctx(gss_msg->ctx); rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue); gss_put_auth(gss_msg->auth); + kfree_const(gss_msg->service_name); kfree(gss_msg); } @@ -410,9 +412,12 @@ gss_upcall_callback(struct rpc_task *task) gss_release_msg(gss_msg); } -static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) +static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg, + const struct cred *cred) { - uid_t uid = from_kuid(&init_user_ns, gss_msg->uid); + struct user_namespace *userns = cred->user_ns; + + uid_t uid = from_kuid_munged(userns, gss_msg->uid); memcpy(gss_msg->databuf, &uid, sizeof(uid)); gss_msg->msg.data = gss_msg->databuf; gss_msg->msg.len = sizeof(uid); @@ -420,17 +425,31 @@ static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg) BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf)); } +static ssize_t +gss_v0_upcall(struct file *file, struct rpc_pipe_msg *msg, + char __user *buf, size_t buflen) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, + struct gss_upcall_msg, + msg); + if (msg->copied == 0) + gss_encode_v0_msg(gss_msg, file->f_cred); + return rpc_pipe_generic_upcall(file, msg, buf, buflen); +} + static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, const char *service_name, - const char *target_name) + const char *target_name, + const struct cred *cred) { + struct user_namespace *userns = cred->user_ns; struct gss_api_mech *mech = gss_msg->auth->mech; char *p = gss_msg->databuf; size_t buflen = sizeof(gss_msg->databuf); int len; len = scnprintf(p, buflen, "mech=%s uid=%d", mech->gm_name, - from_kuid(&init_user_ns, gss_msg->uid)); + from_kuid_munged(userns, gss_msg->uid)); buflen -= len; p += len; gss_msg->msg.len = len; @@ -491,6 +510,25 @@ out_overflow: return -ENOMEM; } +static ssize_t +gss_v1_upcall(struct file *file, struct rpc_pipe_msg *msg, + char __user *buf, size_t buflen) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, + struct gss_upcall_msg, + msg); + int err; + if (msg->copied == 0) { + err = gss_encode_v1_msg(gss_msg, + 
gss_msg->service_name, + gss_msg->auth->target_name, + file->f_cred); + if (err) + return err; + } + return rpc_pipe_generic_upcall(file, msg, buf, buflen); +} + static struct gss_upcall_msg * gss_alloc_msg(struct gss_auth *gss_auth, kuid_t uid, const char *service_name) @@ -513,16 +551,14 @@ gss_alloc_msg(struct gss_auth *gss_auth, refcount_set(&gss_msg->count, 1); gss_msg->uid = uid; gss_msg->auth = gss_auth; - switch (vers) { - case 0: - gss_encode_v0_msg(gss_msg); - break; - default: - err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name); - if (err) + kref_get(&gss_auth->kref); + if (service_name) { + gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS); + if (!gss_msg->service_name) { + err = -ENOMEM; goto err_put_pipe_version; + } } - kref_get(&gss_auth->kref); return gss_msg; err_put_pipe_version: put_pipe_version(gss_auth->net); @@ -581,8 +617,8 @@ gss_refresh_upcall(struct rpc_task *task) /* XXX: warning on the first, under the assumption we * shouldn't normally hit this case on a refresh. */ warn_gssd(); - task->tk_timeout = 15*HZ; - rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL); + rpc_sleep_on_timeout(&pipe_version_rpc_waitqueue, + task, NULL, jiffies + (15 * HZ)); err = -EAGAIN; goto out; } @@ -595,7 +631,6 @@ gss_refresh_upcall(struct rpc_task *task) if (gss_cred->gc_upcall != NULL) rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL); else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { - task->tk_timeout = 0; gss_cred->gc_upcall = gss_msg; /* gss_upcall_callback will release the reference to gss_upcall_msg */ refcount_inc(&gss_msg->count); @@ -707,7 +742,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) goto err; } - uid = make_kuid(&init_user_ns, id); + uid = make_kuid(current_user_ns(), id); if (!uid_valid(uid)) { err = -EINVAL; goto err; @@ -2116,7 +2151,7 @@ static const struct rpc_credops gss_nullops = { }; static const struct rpc_pipe_ops gss_upcall_ops_v0 = { - .upcall = rpc_pipe_generic_upcall, + .upcall = gss_v0_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v0, @@ -2124,7 +2159,7 @@ static const struct rpc_pipe_ops gss_upcall_ops_v0 = { }; static const struct rpc_pipe_ops gss_upcall_ops_v1 = { - .upcall = rpc_pipe_generic_upcall, + .upcall = gss_v1_upcall, .downcall = gss_pipe_downcall, .destroy_msg = gss_pipe_destroy_msg, .open_pipe = gss_pipe_open_v1, diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 4f43383971ba..6f2d30d7b766 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -977,7 +977,6 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, } desc->tfm = hmac; - desc->flags = 0; /* Compute intermediate Kseq from session key */ err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength); @@ -1045,7 +1044,6 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, } desc->tfm = hmac; - desc->flags = 0; /* Compute intermediate Kcrypt from session key */ for (i = 0; i < kctx->gk5e->keylength; i++) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 56cc85c5bc06..6e5d6d240215 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -438,7 +438,6 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) } desc->tfm = hmac; - desc->flags = 0; err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum); kzfree(desc); diff --git a/net/sunrpc/auth_unix.c 
b/net/sunrpc/auth_unix.c index d4018e5a24c5..e7df1f782b2e 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -107,6 +107,8 @@ unx_marshal(struct rpc_task *task, struct xdr_stream *xdr) __be32 *p, *cred_len, *gidarr_len; int i; struct group_info *gi = cred->cr_cred->group_info; + struct user_namespace *userns = clnt->cl_cred ? + clnt->cl_cred->user_ns : &init_user_ns; /* Credential */ @@ -122,14 +124,13 @@ unx_marshal(struct rpc_task *task, struct xdr_stream *xdr) p = xdr_reserve_space(xdr, 3 * sizeof(*p)); if (!p) goto marshal_failed; - *p++ = cpu_to_be32(from_kuid(&init_user_ns, cred->cr_cred->fsuid)); - *p++ = cpu_to_be32(from_kgid(&init_user_ns, cred->cr_cred->fsgid)); + *p++ = cpu_to_be32(from_kuid_munged(userns, cred->cr_cred->fsuid)); + *p++ = cpu_to_be32(from_kgid_munged(userns, cred->cr_cred->fsgid)); gidarr_len = p++; if (gi) for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++) - *p++ = cpu_to_be32(from_kgid(&init_user_ns, - gi->gid[i])); + *p++ = cpu_to_be32(from_kgid_munged(userns, gi->gid[i])); *gidarr_len = cpu_to_be32(p - gidarr_len - 1); *cred_len = cpu_to_be32((p - cred_len - 1) << 2); p = xdr_reserve_space(xdr, (p - gidarr_len - 1) << 2); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 187d10443a15..c1f1afabd024 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -394,6 +394,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (err) goto out_no_clid; + clnt->cl_cred = get_cred(args->cred); clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; @@ -439,6 +440,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, out_no_path: rpc_free_iostats(clnt->cl_metrics); out_no_stats: + put_cred(clnt->cl_cred); rpc_free_clid(clnt); out_no_clid: kfree(clnt); @@ -484,8 +486,11 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, } clnt->cl_softrtry = 1; - if (args->flags & RPC_CLNT_CREATE_HARDRTRY) + if (args->flags & (RPC_CLNT_CREATE_HARDRTRY|RPC_CLNT_CREATE_SOFTERR)) { clnt->cl_softrtry = 0; + if (args->flags & RPC_CLNT_CREATE_SOFTERR) + clnt->cl_softerr = 1; + } if (args->flags & RPC_CLNT_CREATE_AUTOBIND) clnt->cl_autobind = 1; @@ -623,10 +628,12 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, /* Turn off autobind on clones */ new->cl_autobind = 0; new->cl_softrtry = clnt->cl_softrtry; + new->cl_softerr = clnt->cl_softerr; new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_principal = clnt->cl_principal; + new->cl_cred = get_cred(clnt->cl_cred); return new; out_err: @@ -648,6 +655,7 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = clnt->cl_auth->au_flavor, + .cred = clnt->cl_cred, }; return __rpc_clone_client(&args, clnt); } @@ -669,6 +677,7 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = flavor, + .cred = clnt->cl_cred, }; return __rpc_clone_client(&args, clnt); } @@ -827,14 +836,8 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) * Spin lock all_tasks to prevent changes... 
*/ spin_lock(&clnt->cl_lock); - list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) { - if (!RPC_IS_ACTIVATED(rovr)) - continue; - if (!(rovr->tk_flags & RPC_TASK_KILLED)) { - rovr->tk_flags |= RPC_TASK_KILLED; - rpc_exit(rovr, -EIO); - } - } + list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) + rpc_signal_task(rovr); spin_unlock(&clnt->cl_lock); } EXPORT_SYMBOL_GPL(rpc_killall_tasks); @@ -882,6 +885,7 @@ rpc_free_client(struct rpc_clnt *clnt) xprt_put(rcu_dereference_raw(clnt->cl_xprt)); xprt_iter_destroy(&clnt->cl_xpi); rpciod_down(); + put_cred(clnt->cl_cred); rpc_free_clid(clnt); kfree(clnt); return parent; @@ -946,6 +950,7 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, .prognumber = program->number, .version = vers, .authflavor = old->cl_auth->au_flavor, + .cred = old->cl_cred, }; struct rpc_clnt *clnt; int err; @@ -1007,6 +1012,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) atomic_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; + if (clnt->cl_softerr) + task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; if (atomic_read(&clnt->cl_swapper)) @@ -1470,22 +1477,14 @@ void rpc_force_rebind(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_force_rebind); -/* - * Restart an (async) RPC call from the call_prepare state. - * Usually called from within the exit handler. - */ -int -rpc_restart_call_prepare(struct rpc_task *task) +static int +__rpc_restart_call(struct rpc_task *task, void (*action)(struct rpc_task *)) { - if (RPC_ASSASSINATED(task)) - return 0; - task->tk_action = call_start; task->tk_status = 0; - if (task->tk_ops->rpc_call_prepare != NULL) - task->tk_action = rpc_prepare_task; + task->tk_rpc_status = 0; + task->tk_action = action; return 1; } -EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); /* * Restart an (async) RPC call. Usually called from within the @@ -1494,14 +1493,23 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); int rpc_restart_call(struct rpc_task *task) { - if (RPC_ASSASSINATED(task)) - return 0; - task->tk_action = call_start; - task->tk_status = 0; - return 1; + return __rpc_restart_call(task, call_start); } EXPORT_SYMBOL_GPL(rpc_restart_call); +/* + * Restart an (async) RPC call from the call_prepare state. + * Usually called from within the exit handler. + */ +int +rpc_restart_call_prepare(struct rpc_task *task) +{ + if (task->tk_ops->rpc_call_prepare != NULL) + return __rpc_restart_call(task, rpc_prepare_task); + return rpc_restart_call(task); +} +EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); + const char *rpc_proc_name(const struct rpc_task *task) { @@ -1516,6 +1524,19 @@ const char return "no proc"; } +static void +__rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) +{ + task->tk_rpc_status = rpc_status; + rpc_exit(task, tk_status); +} + +static void +rpc_call_rpcerror(struct rpc_task *task, int status) +{ + __rpc_call_rpcerror(task, status, status); +} + /* * 0. 
Initial state * @@ -1540,7 +1561,6 @@ call_start(struct rpc_task *task) clnt->cl_stats->rpccnt++; task->tk_action = call_reserve; rpc_task_set_transport(task, clnt); - call_reserve(task); } /* @@ -1554,9 +1574,6 @@ call_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); - if (rpc_task_need_resched(task)) - return; - call_reserveresult(task); } static void call_retry_reserve(struct rpc_task *task); @@ -1579,13 +1596,12 @@ call_reserveresult(struct rpc_task *task) if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; - call_refresh(task); return; } printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", __func__, status); - rpc_exit(task, -EIO); + rpc_call_rpcerror(task, -EIO); return; } @@ -1605,7 +1621,6 @@ call_reserveresult(struct rpc_task *task) /* fall through */ case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; - call_retry_reserve(task); return; case -EIO: /* probably a shutdown */ break; @@ -1614,7 +1629,7 @@ call_reserveresult(struct rpc_task *task) __func__, status); break; } - rpc_exit(task, status); + rpc_call_rpcerror(task, status); } /* @@ -1628,9 +1643,6 @@ call_retry_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; xprt_retry_reserve(task); - if (rpc_task_need_resched(task)) - return; - call_reserveresult(task); } /* @@ -1645,9 +1657,6 @@ call_refresh(struct rpc_task *task) task->tk_status = 0; task->tk_client->cl_stats->rpcauthrefresh++; rpcauth_refreshcred(task); - if (rpc_task_need_resched(task)) - return; - call_refreshresult(task); } /* @@ -1666,7 +1675,6 @@ call_refreshresult(struct rpc_task *task) case 0: if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; - call_allocate(task); return; } /* Use rate-limiting and a max number of retries if refresh @@ -1685,12 +1693,11 @@ call_refreshresult(struct rpc_task *task) task->tk_cred_retry--; dprintk("RPC: %5u %s: retry refresh creds\n", task->tk_pid, __func__); - call_refresh(task); return; } dprintk("RPC: %5u %s: refresh creds failed with error %d\n", task->tk_pid, __func__, status); - rpc_exit(task, status); + rpc_call_rpcerror(task, status); } /* @@ -1711,10 +1718,8 @@ call_allocate(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_encode; - if (req->rq_buffer) { - call_encode(task); + if (req->rq_buffer) return; - } if (proc->p_proc != 0) { BUG_ON(proc->p_arglen == 0); @@ -1740,14 +1745,10 @@ call_allocate(struct rpc_task *task) status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); - if (status == 0) { - if (rpc_task_need_resched(task)) - return; - call_encode(task); + if (status == 0) return; - } if (status != -ENOMEM) { - rpc_exit(task, status); + rpc_call_rpcerror(task, status); return; } @@ -1813,10 +1814,17 @@ call_encode(struct rpc_task *task) rpc_delay(task, HZ >> 4); break; case -EKEYEXPIRED: - task->tk_action = call_refresh; + if (!task->tk_cred_retry) { + rpc_exit(task, task->tk_status); + } else { + task->tk_action = call_refresh; + task->tk_cred_retry--; + dprintk("RPC: %5u %s: retry refresh creds\n", + task->tk_pid, __func__); + } break; default: - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } return; } else { @@ -1828,8 +1836,12 @@ call_encode(struct rpc_task *task) xprt_request_enqueue_receive(task); xprt_request_enqueue_transmit(task); out: - task->tk_action = call_bind; - call_bind(task); + task->tk_action = call_transmit; + /* Check that the connection is OK */ + if 
(!xprt_bound(task->tk_xprt)) + task->tk_action = call_bind; + else if (!xprt_connected(task->tk_xprt)) + task->tk_action = call_connect; } /* @@ -1847,7 +1859,6 @@ rpc_task_handle_transmitted(struct rpc_task *task) { xprt_end_transmit(task); task->tk_action = call_transmit_status; - call_transmit_status(task); } /* @@ -1865,7 +1876,6 @@ call_bind(struct rpc_task *task) if (xprt_bound(xprt)) { task->tk_action = call_connect; - call_connect(task); return; } @@ -1875,7 +1885,6 @@ call_bind(struct rpc_task *task) if (!xprt_prepare_transmit(task)) return; - task->tk_timeout = xprt->bind_timeout; xprt->ops->rpcbind(task); } @@ -1896,7 +1905,6 @@ call_bind_status(struct rpc_task *task) dprint_status(task); task->tk_status = 0; task->tk_action = call_connect; - call_connect(task); return; } @@ -1957,7 +1965,7 @@ call_bind_status(struct rpc_task *task) task->tk_pid, -task->tk_status); } - rpc_exit(task, status); + rpc_call_rpcerror(task, status); return; retry_timeout: @@ -1981,7 +1989,6 @@ call_connect(struct rpc_task *task) if (xprt_connected(xprt)) { task->tk_action = call_transmit; - call_transmit(task); return; } @@ -1993,7 +2000,7 @@ call_connect(struct rpc_task *task) if (task->tk_status < 0) return; if (task->tk_flags & RPC_TASK_NOCONNECT) { - rpc_exit(task, -ENOTCONN); + rpc_call_rpcerror(task, -ENOTCONN); return; } if (!xprt_prepare_transmit(task)) @@ -2051,10 +2058,9 @@ call_connect_status(struct rpc_task *task) case 0: clnt->cl_stats->netreconn++; task->tk_action = call_transmit; - call_transmit(task); return; } - rpc_exit(task, status); + rpc_call_rpcerror(task, status); return; out_retry: /* Check for timeouts before looping back to call_bind */ @@ -2087,9 +2093,6 @@ call_transmit(struct rpc_task *task) xprt_transmit(task); } xprt_end_transmit(task); - if (rpc_task_need_resched(task)) - return; - call_transmit_status(task); } /* @@ -2105,11 +2108,8 @@ call_transmit_status(struct rpc_task *task) * test first. 
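/*
 * The pattern in the hunks above is uniform: the client state routines
 * no longer tail-call their successor directly. Each one now just
 * assigns task->tk_action and returns, and the scheduler loop invokes
 * the next state. A minimal sketch of that driving pattern (a model of
 * the loop only, not the exact __rpc_execute() body):
 */
static void rpc_fsm_drive(struct rpc_task *task)
{
	for (;;) {
		void (*do_action)(struct rpc_task *) = task->tk_action;

		if (!do_action)
			break;		/* task has finished */
		do_action(task);	/* sets tk_action for the next step */
		if (RPC_IS_QUEUED(task))
			break;		/* task sleeps; rerun when woken */
	}
}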
*/ if (rpc_task_transmitted(task)) { - if (task->tk_status == 0) - xprt_request_wait_receive(task); - if (rpc_task_need_resched(task)) - return; - call_status(task); + task->tk_status = 0; + xprt_request_wait_receive(task); return; } @@ -2145,7 +2145,7 @@ call_transmit_status(struct rpc_task *task) if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); return; } /* fall through */ @@ -2170,7 +2170,6 @@ call_bc_encode(struct rpc_task *task) { xprt_request_enqueue_transmit(task); task->tk_action = call_bc_transmit; - call_bc_transmit(task); } /* @@ -2195,6 +2194,9 @@ call_bc_transmit_status(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; + if (rpc_task_transmitted(task)) + task->tk_status = 0; + dprint_status(task); switch (task->tk_status) { @@ -2261,7 +2263,6 @@ call_status(struct rpc_task *task) status = task->tk_status; if (status >= 0) { task->tk_action = call_decode; - call_decode(task); return; } @@ -2308,7 +2309,7 @@ call_status(struct rpc_task *task) rpc_check_timeout(task); return; out_exit: - rpc_exit(task, status); + rpc_call_rpcerror(task, status); } static bool @@ -2332,29 +2333,40 @@ rpc_check_timeout(struct rpc_task *task) task->tk_timeouts++; if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) { - rpc_exit(task, -ETIMEDOUT); + rpc_call_rpcerror(task, -ETIMEDOUT); return; } if (RPC_IS_SOFT(task)) { + /* + * Once a "no retrans timeout" soft task (a.k.a. NFSv4) has + * been sent, it should time out only if the transport + * connection gets terminally broken. + */ + if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) && + rpc_check_connected(task->tk_rqstp)) + return; + if (clnt->cl_chatty) { - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + pr_notice_ratelimited( + "%s: server %s not responding, timed out\n", clnt->cl_program->name, task->tk_xprt->servername); } if (task->tk_flags & RPC_TASK_TIMEOUT) - rpc_exit(task, -ETIMEDOUT); + rpc_call_rpcerror(task, -ETIMEDOUT); else - rpc_exit(task, -EIO); + __rpc_call_rpcerror(task, -EIO, -ETIMEDOUT); return; } if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; if (clnt->cl_chatty) { - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_program->name, - task->tk_xprt->servername); + pr_notice_ratelimited( + "%s: server %s not responding, still trying\n", + clnt->cl_program->name, + task->tk_xprt->servername); } } rpc_force_rebind(clnt); @@ -2384,7 +2396,7 @@ call_decode(struct rpc_task *task) if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { - printk(KERN_NOTICE "%s: server %s OK\n", + pr_notice_ratelimited("%s: server %s OK\n", clnt->cl_program->name, task->tk_xprt->servername); } @@ -2907,7 +2919,7 @@ static void rpc_show_task(const struct rpc_clnt *clnt, printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n", task->tk_pid, task->tk_flags, task->tk_status, - clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops, + clnt, task->tk_rqstp, rpc_task_timeout(task), task->tk_ops, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), task->tk_action, rpc_waitq); } diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 19bb356230ed..95ebd76b132d 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -33,7 +33,7 @@ tasks_show(struct seq_file *f, void *v) seq_printf(f, "%5u %04x %6d 0x%x 0x%x %8ld %ps %sv%u %s a:%ps q:%s\n", task->tk_pid, task->tk_flags, task->tk_status, -
clnt->cl_clid, xid, task->tk_timeout, task->tk_ops, + clnt->cl_clid, xid, rpc_task_timeout(task), task->tk_ops, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), task->tk_action, rpc_waitq); return 0; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 69663681bf9d..979d23646e33 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -202,18 +202,11 @@ rpc_alloc_inode(struct super_block *sb) } static void -rpc_i_callback(struct rcu_head *head) +rpc_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); } -static void -rpc_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, rpc_i_callback); -} - static int rpc_pipe_open(struct inode *inode, struct file *filp) { @@ -1123,7 +1116,7 @@ void rpc_remove_cache_dir(struct dentry *dentry) */ static const struct super_operations s_ops = { .alloc_inode = rpc_alloc_inode, - .destroy_inode = rpc_destroy_inode, + .free_inode = rpc_free_inode, .statfs = simple_statfs, }; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 41a971ac1c63..2277b7cdad27 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -240,6 +240,7 @@ static int rpcb_create_local_unix(struct net *net) .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_NULL, + .cred = current_cred(), /* * We turn off the idle timeout to prevent the kernel * from automatically disconnecting the socket. @@ -299,6 +300,7 @@ static int rpcb_create_local_net(struct net *net) .program = &rpcb_program, .version = RPCBVERS_2, .authflavor = RPC_AUTH_UNIX, + .cred = current_cred(), .flags = RPC_CLNT_CREATE_NOPING, }; struct rpc_clnt *clnt, *clnt4; @@ -358,7 +360,8 @@ out: static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, const char *hostname, struct sockaddr *srvaddr, size_t salen, - int proto, u32 version) + int proto, u32 version, + const struct cred *cred) { struct rpc_create_args args = { .net = net, @@ -370,6 +373,7 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, .program = &rpcb_program, .version = version, .authflavor = RPC_AUTH_UNIX, + .cred = cred, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_NONPRIVPORT), }; @@ -694,7 +698,8 @@ void rpcb_getport_async(struct rpc_task *task) /* Put self on the wait queue to ensure we get notified if * some other task is already attempting to bind the port */ - rpc_sleep_on(&xprt->binding, task, NULL); + rpc_sleep_on_timeout(&xprt->binding, task, + NULL, jiffies + xprt->bind_timeout); if (xprt_test_and_set_binding(xprt)) { dprintk("RPC: %5u %s: waiting for another binder\n", @@ -744,7 +749,8 @@ void rpcb_getport_async(struct rpc_task *task) rpcb_clnt = rpcb_create(xprt->xprt_net, clnt->cl_nodename, xprt->servername, sap, salen, - xprt->prot, bind_version); + xprt->prot, bind_version, + clnt->cl_cred); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 28956c70100a..1a12fb03e611 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -58,6 +58,20 @@ static struct rpc_wait_queue delay_queue; struct workqueue_struct *rpciod_workqueue __read_mostly; struct workqueue_struct *xprtiod_workqueue __read_mostly; +unsigned long +rpc_task_timeout(const struct rpc_task *task) +{ + unsigned long timeout = READ_ONCE(task->tk_timeout); + + if (timeout != 0) { + unsigned long now = jiffies; + if (time_before(now, timeout)) + 
return timeout - now; + } + return 0; +} +EXPORT_SYMBOL_GPL(rpc_task_timeout); + /* * Disable the timer for a given RPC task. Should be called with * queue->lock and bh_disabled in order to avoid races within @@ -66,7 +80,7 @@ struct workqueue_struct *xprtiod_workqueue __read_mostly; static void __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) { - if (task->tk_timeout == 0) + if (list_empty(&task->u.tk_wait.timer_list)) return; dprintk("RPC: %5u disabling timer\n", task->tk_pid); task->tk_timeout = 0; @@ -78,25 +92,21 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task) static void rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires) { - queue->timer_list.expires = expires; - mod_timer(&queue->timer_list.timer, expires); + timer_reduce(&queue->timer_list.timer, expires); } /* * Set up a timer for the current task. */ static void -__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) +__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task, + unsigned long timeout) { - if (!task->tk_timeout) - return; - dprintk("RPC: %5u setting alarm for %u ms\n", - task->tk_pid, jiffies_to_msecs(task->tk_timeout)); + task->tk_pid, jiffies_to_msecs(timeout - jiffies)); - task->u.tk_wait.expires = jiffies + task->tk_timeout; - if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) - rpc_set_queue_timer(queue, task->u.tk_wait.expires); + task->tk_timeout = timeout; + rpc_set_queue_timer(queue, timeout); list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } @@ -188,6 +198,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, if (RPC_IS_QUEUED(task)) return; + INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); else if (RPC_IS_SWAPPER(task)) @@ -238,7 +249,9 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c queue->maxpriority = nr_queues - 1; rpc_reset_waitqueue_priority(queue); queue->qlen = 0; - timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0); + timer_setup(&queue->timer_list.timer, + __rpc_queue_timer_fn, + TIMER_DEFERRABLE); INIT_LIST_HEAD(&queue->timer_list.list); rpc_assign_waitqueue_name(queue, qname); } @@ -362,7 +375,6 @@ static void rpc_make_runnable(struct workqueue_struct *wq, */ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action, unsigned char queue_priority) { dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", @@ -372,47 +384,100 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, __rpc_add_wait_queue(q, task, queue_priority); - WARN_ON_ONCE(task->tk_callback != NULL); - task->tk_callback = action; - __rpc_add_timer(q, task); } -void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action) +static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, + struct rpc_task *task, unsigned long timeout, + unsigned char queue_priority) +{ + if (time_is_after_jiffies(timeout)) { + __rpc_sleep_on_priority(q, task, queue_priority); + __rpc_add_timer(q, task, timeout); + } else + task->tk_status = -ETIMEDOUT; +} + +static void rpc_set_tk_callback(struct rpc_task *task, rpc_action action) +{ + if (action && !WARN_ON_ONCE(task->tk_callback != NULL)) + task->tk_callback = action; +} + +static bool rpc_sleep_check_activated(struct rpc_task *task) { /* We shouldn't ever put an inactive task to sleep */ - 
WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); - if (!RPC_IS_ACTIVATED(task)) { + if (WARN_ON_ONCE(!RPC_IS_ACTIVATED(task))) { task->tk_status = -EIO; rpc_put_task_async(task); - return; + return false; } + return true; +} + +void rpc_sleep_on_timeout(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, unsigned long timeout) +{ + if (!rpc_sleep_check_activated(task)) + return; + + rpc_set_tk_callback(task, action); + + /* + * Protect the queue operations. + */ + spin_lock_bh(&q->lock); + __rpc_sleep_on_priority_timeout(q, task, timeout, task->tk_priority); + spin_unlock_bh(&q->lock); +} +EXPORT_SYMBOL_GPL(rpc_sleep_on_timeout); +void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action) +{ + if (!rpc_sleep_check_activated(task)) + return; + + rpc_set_tk_callback(task, action); + + WARN_ON_ONCE(task->tk_timeout != 0); /* * Protect the queue operations. */ spin_lock_bh(&q->lock); - __rpc_sleep_on_priority(q, task, action, task->tk_priority); + __rpc_sleep_on_priority(q, task, task->tk_priority); spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on); +void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, + struct rpc_task *task, unsigned long timeout, int priority) +{ + if (!rpc_sleep_check_activated(task)) + return; + + priority -= RPC_PRIORITY_LOW; + /* + * Protect the queue operations. + */ + spin_lock_bh(&q->lock); + __rpc_sleep_on_priority_timeout(q, task, timeout, priority); + spin_unlock_bh(&q->lock); +} +EXPORT_SYMBOL_GPL(rpc_sleep_on_priority_timeout); + void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, - rpc_action action, int priority) + int priority) { - /* We shouldn't ever put an inactive task to sleep */ - WARN_ON_ONCE(!RPC_IS_ACTIVATED(task)); - if (!RPC_IS_ACTIVATED(task)) { - task->tk_status = -EIO; - rpc_put_task_async(task); + if (!rpc_sleep_check_activated(task)) return; - } + WARN_ON_ONCE(task->tk_timeout != 0); + priority -= RPC_PRIORITY_LOW; /* * Protect the queue operations. 
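/*
 * Note the new convention in the sleep variants above: the *_timeout
 * entry points take an absolute jiffies deadline rather than a
 * relative value staged in tk_timeout beforehand. A usage sketch
 * (illustrative only; "my_waitq" is not a queue in this code):
 */
static void example_sleep_five_seconds(struct rpc_wait_queue *my_waitq,
				       struct rpc_task *task)
{
	rpc_sleep_on_timeout(my_waitq, task, NULL, jiffies + 5 * HZ);
}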
*/ spin_lock_bh(&q->lock); - __rpc_sleep_on_priority(q, task, action, priority - RPC_PRIORITY_LOW); + __rpc_sleep_on_priority(q, task, priority); spin_unlock_bh(&q->lock); } EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); @@ -704,7 +769,7 @@ static void __rpc_queue_timer_fn(struct timer_list *t) spin_lock(&queue->lock); expires = now = jiffies; list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) { - timeo = task->u.tk_wait.expires; + timeo = task->tk_timeout; if (time_after_eq(now, timeo)) { dprintk("RPC: %5u timeout\n", task->tk_pid); task->tk_status = -ETIMEDOUT; @@ -730,8 +795,7 @@ static void __rpc_atrun(struct rpc_task *task) */ void rpc_delay(struct rpc_task *task, unsigned long delay) { - task->tk_timeout = delay; - rpc_sleep_on(&delay_queue, task, __rpc_atrun); + rpc_sleep_on_timeout(&delay_queue, task, __rpc_atrun, jiffies + delay); } EXPORT_SYMBOL_GPL(rpc_delay); @@ -759,8 +823,7 @@ static void rpc_reset_task_statistics(struct rpc_task *task) { task->tk_timeouts = 0; - task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_KILLED|RPC_TASK_SENT); - + task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_SENT); rpc_init_task_statistics(task); } @@ -773,7 +836,6 @@ void rpc_exit_task(struct rpc_task *task) if (task->tk_ops->rpc_call_done != NULL) { task->tk_ops->rpc_call_done(task, task->tk_calldata); if (task->tk_action != NULL) { - WARN_ON(RPC_ASSASSINATED(task)); /* Always release the RPC slot and buffer memory */ xprt_release(task); rpc_reset_task_statistics(task); @@ -781,6 +843,19 @@ void rpc_exit_task(struct rpc_task *task) } } +void rpc_signal_task(struct rpc_task *task) +{ + struct rpc_wait_queue *queue; + + if (!RPC_IS_ACTIVATED(task)) + return; + set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); + smp_mb__after_atomic(); + queue = READ_ONCE(task->tk_waitqueue); + if (queue) + rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS); +} + void rpc_exit(struct rpc_task *task, int status) { task->tk_status = status; @@ -836,6 +911,13 @@ static void __rpc_execute(struct rpc_task *task) */ if (!RPC_IS_QUEUED(task)) continue; + + /* + * Signalled tasks should exit rather than sleep. + */ + if (RPC_SIGNALLED(task)) + rpc_exit(task, -ERESTARTSYS); + /* * The queue->lock protects against races with * rpc_make_runnable(). @@ -861,7 +943,7 @@ static void __rpc_execute(struct rpc_task *task) status = out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_QUEUED, rpc_wait_bit_killable, TASK_KILLABLE); - if (status == -ERESTARTSYS) { + if (status < 0) { /* * When a sync task receives a signal, it exits with * -ERESTARTSYS. In order to catch any callbacks that @@ -869,7 +951,7 @@ static void __rpc_execute(struct rpc_task *task) * break the loop here, but go around once more. */ dprintk("RPC: %5u got signal\n", task->tk_pid); - task->tk_flags |= RPC_TASK_KILLED; + set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); rpc_exit(task, -ERESTARTSYS); } dprintk("RPC: %5u sync task resuming\n", task->tk_pid); diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 7e55cfc69697..9faea12624a6 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -106,7 +106,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. 
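/*
 * The hunk just below swaps GFP_ATOMIC for GFP_NOWAIT | __GFP_NOWARN.
 * That combination suits opportunistic allocations: the caller already
 * tolerates failure (it falls back to -ENOMEM), so there is no reason
 * to dip into the atomic emergency reserves or to print an allocation
 * warning. The idiom as a standalone sketch:
 */
static struct page *try_alloc_spare_page(void)
{
	/* may return NULL under memory pressure; callers must cope */
	return alloc_page(GFP_NOWAIT | __GFP_NOWARN);
}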
*/ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_ATOMIC); + *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); if (unlikely(*ppage == NULL)) { if (copied == 0) copied = -ENOMEM; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d7117d241460..a9d40bc7ebed 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -73,6 +73,15 @@ static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); +static unsigned long xprt_request_timeout(const struct rpc_rqst *req) +{ + unsigned long timeout = jiffies + req->rq_timeout; + + if (time_before(timeout, req->rq_majortimeo)) + return timeout; + return req->rq_majortimeo; +} + /** * xprt_register_transport - register a transport implementation * @transport: transport to register @@ -209,9 +218,12 @@ out_unlock: out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - rpc_sleep_on(&xprt->sending, task, NULL); + if (RPC_IS_SOFT(task)) + rpc_sleep_on_timeout(&xprt->sending, task, NULL, + xprt_request_timeout(req)); + else + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); @@ -273,9 +285,12 @@ out_unlock: xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - rpc_sleep_on(&xprt->sending, task, NULL); + if (RPC_IS_SOFT(task)) + rpc_sleep_on_timeout(&xprt->sending, task, NULL, + xprt_request_timeout(req)); + else + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -554,53 +569,44 @@ bool xprt_write_space(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_write_space); -/** - * xprt_set_retrans_timeout_def - set a request's retransmit timeout - * @task: task whose timeout is to be set - * - * Set a request's retransmit timeout based on the transport's - * default timeout parameters. Used by transports that don't adjust - * the retransmit timeout based on round-trip time estimation. - */ -void xprt_set_retrans_timeout_def(struct rpc_task *task) +static unsigned long xprt_abs_ktime_to_jiffies(ktime_t abstime) { - task->tk_timeout = task->tk_rqstp->rq_timeout; + s64 delta = ktime_to_ns(ktime_get() - abstime); + return likely(delta >= 0) ? + jiffies - nsecs_to_jiffies(delta) : + jiffies + nsecs_to_jiffies(-delta); } -EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def); -/** - * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout - * @task: task whose timeout is to be set - * - * Set a request's retransmit timeout using the RTT estimator. 
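/*
 * Worked example for xprt_request_timeout() above, with illustrative
 * numbers: if rq_timeout is 5 seconds and the absolute rq_majortimeo
 * deadline is 12 seconds away, the task sleeps for the 5-second
 * retransmit interval; if only 3 seconds remain before rq_majortimeo,
 * the task sleeps 3 seconds so the major timeout fires on schedule.
 */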
- */ -void xprt_set_retrans_timeout_rtt(struct rpc_task *task) +static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req) { - int timer = task->tk_msg.rpc_proc->p_timer; - struct rpc_clnt *clnt = task->tk_client; - struct rpc_rtt *rtt = clnt->cl_rtt; - struct rpc_rqst *req = task->tk_rqstp; - unsigned long max_timeout = clnt->cl_timeout->to_maxval; + const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; + unsigned long majortimeo = req->rq_timeout; - task->tk_timeout = rpc_calc_rto(rtt, timer); - task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; - if (task->tk_timeout > max_timeout || task->tk_timeout == 0) - task->tk_timeout = max_timeout; + if (to->to_exponential) + majortimeo <<= to->to_retries; + else + majortimeo += to->to_increment * to->to_retries; + if (majortimeo > to->to_maxval || majortimeo == 0) + majortimeo = to->to_maxval; + return majortimeo; } -EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt); static void xprt_reset_majortimeo(struct rpc_rqst *req) { - const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; + req->rq_majortimeo += xprt_calc_majortimeo(req); +} - req->rq_majortimeo = req->rq_timeout; - if (to->to_exponential) - req->rq_majortimeo <<= to->to_retries; +static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req) +{ + unsigned long time_init; + struct rpc_xprt *xprt = req->rq_xprt; + + if (likely(xprt && xprt_connected(xprt))) + time_init = jiffies; else - req->rq_majortimeo += to->to_increment * to->to_retries; - if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0) - req->rq_majortimeo = to->to_maxval; - req->rq_majortimeo += jiffies; + time_init = xprt_abs_ktime_to_jiffies(task->tk_start); + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_majortimeo = time_init + xprt_calc_majortimeo(req); } /** @@ -822,9 +828,9 @@ void xprt_connect(struct rpc_task *task) xprt->ops->close(xprt); if (!xprt_connected(xprt)) { - task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; - rpc_sleep_on(&xprt->pending, task, NULL); + rpc_sleep_on_timeout(&xprt->pending, task, NULL, + xprt_request_timeout(task->tk_rqstp)); if (test_bit(XPRT_CLOSING, &xprt->state)) return; @@ -949,7 +955,7 @@ xprt_is_pinned_rqst(struct rpc_rqst *req) * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() - * so should be holding the xprt receive lock. + * so should be holding xprt->queue_lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { @@ -961,7 +967,7 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst); * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to unpin * - * Caller should be holding the xprt receive lock. + * Caller should be holding xprt->queue_lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { @@ -1017,7 +1023,6 @@ xprt_request_enqueue_receive(struct rpc_task *task) set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); spin_unlock(&xprt->queue_lock); - xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); } @@ -1103,6 +1108,49 @@ static void xprt_timer(struct rpc_task *task) } /** + * xprt_wait_for_reply_request_def - wait for reply + * @task: pointer to rpc_task + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation, + * and put the task to sleep on the pending queue.
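/*
 * Transports now select a wait strategy instead of installing a
 * timeout setter. A sketch of the ops wiring (the xprtrdma hunks
 * further down do this for real; "example_ops" is illustrative):
 */
static const struct rpc_xprt_ops example_ops = {
	/* ... other callbacks ... */
	.wait_for_reply_request	= xprt_wait_for_reply_request_def,
};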
+ */ +void xprt_wait_for_reply_request_def(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, + xprt_request_timeout(req)); +} +EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def); + +/** + * xprt_wait_for_reply_request_rtt - wait for reply using RTT estimator + * @task: pointer to rpc_task + * + * Set a request's retransmit timeout using the RTT estimator, + * and put the task to sleep on the pending queue. + */ +void xprt_wait_for_reply_request_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rtt *rtt = clnt->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = clnt->cl_timeout->to_maxval; + unsigned long timeout; + + timeout = rpc_calc_rto(rtt, timer); + timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (timeout > max_timeout || timeout == 0) + timeout = max_timeout; + rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, + jiffies + timeout); +} +EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_rtt); + +/** * xprt_request_wait_receive - wait for the reply to an RPC request * @task: RPC task about to send a request * @@ -1121,8 +1169,7 @@ void xprt_request_wait_receive(struct rpc_task *task) */ spin_lock(&xprt->queue_lock); if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { - xprt->ops->set_retrans_timeout(task); - rpc_sleep_on(&xprt->pending, task, xprt_timer); + xprt->ops->wait_for_reply_request(task); /* * Send an extra queue wakeup call if the * connection was dropped in case the call to @@ -1337,6 +1384,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) if (status < 0) goto out_dequeue; } + if (RPC_SIGNALLED(task)) { + status = -ERESTARTSYS; + goto out_dequeue; + } } /* @@ -1605,7 +1656,6 @@ xprt_request_init(struct rpc_task *task) struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; - req->rq_timeout = task->tk_client->cl_timeout->to_initval; req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; @@ -1618,7 +1668,7 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; - xprt_reset_majortimeo(req); + xprt_init_majortimeo(task, req); dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, req, ntohl(req->rq_xid)); } @@ -1647,7 +1697,6 @@ void xprt_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - task->tk_timeout = 0; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) xprt_do_reserve(xprt, task); @@ -1670,7 +1719,6 @@ void xprt_retry_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - task->tk_timeout = 0; task->tk_status = -EAGAIN; xprt_do_reserve(xprt, task); } @@ -1827,7 +1875,9 @@ found: xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) - timer_setup(&xprt->timer, xprt_init_autodisconnect, 0); + timer_setup(&xprt->timer, + xprt_init_autodisconnect, + TIMER_DEFERRABLE); else timer_setup(&xprt->timer, NULL, 0); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index d79b18c1f4cd..ce986591f213 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -19,45 +19,6 @@ #undef RPCRDMA_BACKCHANNEL_DEBUG -static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, - unsigned int count) -{ - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_req *req; - struct 
rpc_rqst *rqst; - unsigned int i; - - for (i = 0; i < (count << 1); i++) { - struct rpcrdma_regbuf *rb; - size_t size; - - req = rpcrdma_create_req(r_xprt); - if (IS_ERR(req)) - return PTR_ERR(req); - rqst = &req->rl_slot; - - rqst->rq_xprt = xprt; - INIT_LIST_HEAD(&rqst->rq_bc_list); - __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock(&xprt->bc_pa_lock); - list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock(&xprt->bc_pa_lock); - - size = r_xprt->rx_data.inline_rsize; - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) - goto out_fail; - req->rl_sendbuf = rb; - xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, - min_t(size_t, size, PAGE_SIZE)); - } - return 0; - -out_fail: - rpcrdma_req_destroy(req); - return -ENOMEM; -} - /** * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests * @xprt: transport associated with these backchannel resources @@ -68,34 +29,10 @@ out_fail: int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; - /* The backchannel reply path returns each rpc_rqst to the - * bc_pa_list _after_ the reply is sent. If the server is - * faster than the client, it can send another backward - * direction request before the rpc_rqst is returned to the - * list. The client rejects the request in this case. - * - * Twice as many rpc_rqsts are prepared to ensure there is - * always an rpc_rqst available as soon as a reply is sent. - */ - if (reqs > RPCRDMA_BACKWARD_WRS >> 1) - goto out_err; - - rc = rpcrdma_bc_setup_reqs(r_xprt, reqs); - if (rc) - goto out_free; - - r_xprt->rx_buf.rb_bc_srv_max_requests = reqs; + r_xprt->rx_buf.rb_bc_srv_max_requests = RPCRDMA_BACKWARD_WRS >> 1; trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; - -out_free: - xprt_rdma_bc_destroy(xprt, reqs); - -out_err: - pr_err("RPC: %s: setup backchannel transport failed\n", __func__); - return -ENOMEM; } /** @@ -107,10 +44,10 @@ out_err: size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -123,7 +60,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, - req->rl_rdmabuf->rg_base, rqst); + rdmab_data(req->rl_rdmabuf), rqst); p = xdr_reserve_space(&req->rl_stream, 28); if (unlikely(!p)) @@ -223,6 +160,43 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) spin_unlock(&xprt->bc_pa_lock); } +static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + size_t size; + + spin_lock(&xprt->bc_pa_lock); + rqst = list_first_entry_or_null(&xprt->bc_pa_list, struct rpc_rqst, + rq_bc_pa_list); + if (!rqst) + goto create_req; + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + return rqst; + +create_req: + spin_unlock(&xprt->bc_pa_lock); + + /* Set a limit to prevent a remote from overrunning our resources. 
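/*
 * rpcrdma_bc_rqst_get() above is a "reuse, else create up to a cap"
 * pattern: take a free rqst parked on bc_pa_list if one exists,
 * otherwise allocate a fresh one only while the cap tested just below
 * has not been reached. In pseudo-C:
 *
 *	rqst = take_first_free(list);
 *	if (!rqst && count < cap) {
 *		rqst = allocate();
 *		count++;
 *	}
 *	if (!rqst)
 *		reject_the_backchannel_call();
 */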
+ */ + if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) + return NULL; + + size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); + req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); + if (!req) + return NULL; + + xprt->bc_alloc_count++; + rqst = &req->rl_slot; + rqst->rq_xprt = xprt; + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), size); + return rqst; +} + /** * rpcrdma_bc_receive_call - Handle a backward direction call * @r_xprt: transport receiving the call @@ -254,18 +228,10 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, pr_info("RPC: %s: %*ph\n", __func__, size, p); #endif - /* Grab a free bc rqst */ - spin_lock(&xprt->bc_pa_lock); - if (list_empty(&xprt->bc_pa_list)) { - spin_unlock(&xprt->bc_pa_lock); + rqst = rpcrdma_bc_rqst_get(r_xprt); + if (!rqst) goto out_overflow; - } - rqst = list_first_entry(&xprt->bc_pa_list, - struct rpc_rqst, rq_bc_pa_list); - list_del(&rqst->rq_bc_pa_list); - spin_unlock(&xprt->bc_pa_lock); - /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; rqst->rq_xid = *p; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 52cb6c1b0c2b..794ba4ca0994 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -82,13 +82,13 @@ /** * frwr_is_supported - Check if device supports FRWR - * @ia: interface adapter to check + * @device: interface adapter to check * * Returns true if device supports FRWR, otherwise false */ -bool frwr_is_supported(struct rpcrdma_ia *ia) +bool frwr_is_supported(struct ib_device *device) { - struct ib_device_attr *attrs = &ia->ri_device->attrs; + struct ib_device_attr *attrs = &device->attrs; if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) goto out_not_supported; @@ -98,7 +98,7 @@ bool frwr_is_supported(struct rpcrdma_ia *ia) out_not_supported: pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", - ia->ri_device->name); + device->name); return false; } @@ -131,7 +131,7 @@ frwr_mr_recycle_worker(struct work_struct *work) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -194,12 +194,11 @@ out_list_err: * frwr_open - Prepare an endpoint for use with FRWR * @ia: interface adapter this endpoint will use * @ep: endpoint to prepare - * @cdata: transport parameters * * On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr - * cdata->max_requests + * ep->rep_max_requests * ia->ri_max_segs * * And these FRWR-related fields: @@ -208,10 +207,9 @@ out_list_err: * * On failure, a negative errno is returned. 
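/*
 * frwr_is_supported() now takes the ib_device directly, so it can be
 * consulted before an rpcrdma_ia is fully constructed. A sketch of the
 * core of that probe (the real helper performs further attribute
 * checks that the hunk above elides):
 */
static bool example_frwr_probe(const struct ib_device *device)
{
	/* FRWR requires the memory-management extension verbs */
	return !!(device->attrs.device_cap_flags &
		  IB_DEVICE_MEM_MGT_EXTENSIONS);
}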
*/ -int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) { - struct ib_device_attr *attrs = &ia->ri_device->attrs; + struct ib_device_attr *attrs = &ia->ri_id->device->attrs; int max_qp_wr, depth, delta; ia->ri_mrtype = IB_MR_TYPE_MEM_REG; @@ -253,24 +251,23 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } while (delta > 0); } - max_qp_wr = ia->ri_device->attrs.max_qp_wr; + max_qp_wr = ia->ri_id->device->attrs.max_qp_wr; max_qp_wr -= RPCRDMA_BACKWARD_WRS; max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (cdata->max_requests > max_qp_wr) - cdata->max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; + if (ep->rep_max_requests > max_qp_wr) + ep->rep_max_requests = max_qp_wr; + ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - cdata->max_requests = max_qp_wr / depth; - if (!cdata->max_requests) + ep->rep_max_requests = max_qp_wr / depth; + if (!ep->rep_max_requests) return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; + ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; } ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ @@ -300,15 +297,6 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth); } -static void -__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) -{ - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("rpcrdma: %s: %s (%u/0x%x)\n", - wr, ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); -} - /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC * @cq: completion queue (ignored) @@ -323,10 +311,8 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frwr->fr_state = FRWR_FLUSHED_FR; - __frwr_sendcompletion_flush(wc, "fastreg"); - } trace_xprtrdma_wc_fastreg(wc, frwr); } @@ -344,10 +330,8 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frwr->fr_state = FRWR_FLUSHED_LI; - __frwr_sendcompletion_flush(wc, "localinv"); - } trace_xprtrdma_wc_li(wc, frwr); } @@ -366,12 +350,10 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - if (wc->status != IB_WC_SUCCESS) { + if (wc->status != IB_WC_SUCCESS) frwr->fr_state = FRWR_FLUSHED_LI; - __frwr_sendcompletion_flush(wc, "localinv"); - } - complete(&frwr->fr_linv_done); trace_xprtrdma_wc_li_wake(wc, frwr); + complete(&frwr->fr_linv_done); } /** @@ -436,7 +418,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, } mr->mr_dir = rpcrdma_data_dir(writing); - mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + mr->mr_nents = + ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; @@ -466,7 +449,7 @@ struct 
rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - frwr->fr_state = FRWR_IS_INVALID; + mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); rpcrdma_mr_put(mr); return ERR_PTR(-EIO); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 6c1fb270f127..85115a2e2639 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -105,16 +105,23 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) return size; } +/** + * rpcrdma_set_max_header_sizes - Initialize inline payload sizes + * @r_xprt: transport instance to initialize + * + * The max_inline fields contain the maximum size of an RPC message + * so the marshaling code doesn't have to repeat this calculation + * for every RPC. + */ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - unsigned int maxsegs = ia->ri_max_segs; - - ia->ri_max_inline_write = cdata->inline_wsize - - rpcrdma_max_call_header_size(maxsegs); - ia->ri_max_inline_read = cdata->inline_rsize - - rpcrdma_max_reply_header_size(maxsegs); + unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + ep->rep_max_inline_send = + ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->rep_max_inline_recv = + ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -131,7 +138,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdr = &rqst->rq_snd_buf; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) return false; if (xdr->page_len) { @@ -159,9 +166,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -173,10 +178,9 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct rpc_rqst *rqst) { const struct xdr_buf *buf = &rqst->rq_rcv_buf; - const struct rpcrdma_ia *ia = &r_xprt->rx_ia; - return buf->head[0].iov_len + buf->tail[0].iov_len < - ia->ri_max_inline_read; + return (buf->head[0].iov_len + buf->tail[0].iov_len) < + r_xprt->rx_ep.rep_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not @@ -238,7 +242,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, */ if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { if (!*ppages) - *ppages = alloc_page(GFP_ATOMIC); + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); if (!*ppages) return -ENOBUFS; } @@ -508,50 +512,45 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } /** - * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * rpcrdma_sendctx_unmap - DMA-unmap Send buffer * @sc: sendctx containing SGEs to unmap * */ -void -rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) { - struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; struct ib_sge *sge; - unsigned int count; /* The first two SGEs contain the transport header and * the inline buffer. 
These are always left mapped so * they can be cheaply re-used. */ - sge = &sc->sc_sges[2]; - for (count = sc->sc_unmap_count; count; ++sge, --count) - ib_dma_unmap_page(ia->ri_device, - sge->addr, sge->length, DMA_TO_DEVICE); + for (sge = &sc->sc_sges[2]; sc->sc_unmap_count; + ++sge, --sc->sc_unmap_count) + ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length, + DMA_TO_DEVICE); - if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { - smp_mb__after_atomic(); + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, + &sc->sc_req->rl_flags)) wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); - } } /* Prepare an SGE for the RPC-over-RDMA transport header. */ -static bool -rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - u32 len) +static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 len) { struct rpcrdma_sendctx *sc = req->rl_sendctx; struct rpcrdma_regbuf *rb = req->rl_rdmabuf; struct ib_sge *sge = sc->sc_sges; - if (!rpcrdma_dma_map_regbuf(ia, rb)) + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) goto out_regbuf; sge->addr = rdmab_addr(rb); sge->length = len; sge->lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, - sge->length, DMA_TO_DEVICE); + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, + DMA_TO_DEVICE); sc->sc_wr.num_sge++; return true; @@ -563,23 +562,23 @@ out_regbuf: /* Prepare the Send SGEs. The head and tail iovec, and each entry * in the page list, gets its own SGE. */ -static bool -rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, - struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr, + enum rpcrdma_chunktype rtype) { struct rpcrdma_sendctx *sc = req->rl_sendctx; unsigned int sge_no, page_base, len, remaining; struct rpcrdma_regbuf *rb = req->rl_sendbuf; - struct ib_device *device = ia->ri_device; struct ib_sge *sge = sc->sc_sges; - u32 lkey = ia->ri_pd->local_dma_lkey; struct page *page, **ppages; /* The head iovec is straightforward, as it is already * DMA-mapped. Sync the content that has changed. 
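/*
 * The asymmetry rpcrdma_sendctx_unmap() relies on, restated:
 * sc_sges[0] (the transport header) and sc_sges[1] (the inline send
 * buffer) are DMA-mapped once for the life of their regbufs, so only
 * the entries counted in sc_unmap_count, starting at index 2, need to
 * be unmapped after each Send completes.
 */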
*/ - if (!rpcrdma_dma_map_regbuf(ia, rb)) + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) goto out_regbuf; + sc->sc_device = rdmab_device(rb); sge_no = 1; sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; @@ -626,13 +625,14 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, goto out_mapping_overflow; len = min_t(u32, PAGE_SIZE - page_base, remaining); - sge[sge_no].addr = ib_dma_map_page(device, *ppages, - page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(device, sge[sge_no].addr)) + sge[sge_no].addr = + ib_dma_map_page(rdmab_device(rb), *ppages, + page_base, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), + sge[sge_no].addr)) goto out_mapping_err; sge[sge_no].length = len; - sge[sge_no].lkey = lkey; + sge[sge_no].lkey = rdmab_lkey(rb); sc->sc_unmap_count++; ppages++; @@ -653,13 +653,13 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, map_tail: sge_no++; - sge[sge_no].addr = ib_dma_map_page(device, page, - page_base, len, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(device, sge[sge_no].addr)) + sge[sge_no].addr = + ib_dma_map_page(rdmab_device(rb), page, page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr)) goto out_mapping_err; sge[sge_no].length = len; - sge[sge_no].lkey = lkey; + sge[sge_no].lkey = rdmab_lkey(rb); sc->sc_unmap_count++; } @@ -674,12 +674,12 @@ out_regbuf: return false; out_mapping_overflow: - rpcrdma_unmap_sendctx(sc); + rpcrdma_sendctx_unmap(sc); pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); return false; out_mapping_err: - rpcrdma_unmap_sendctx(sc); + rpcrdma_sendctx_unmap(sc); trace_xprtrdma_dma_maperr(sge[sge_no].addr); return false; } @@ -699,7 +699,7 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 hdrlen, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { - req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt); if (!req->rl_sendctx) return -EAGAIN; req->rl_sendctx->sc_wr.num_sge = 0; @@ -707,11 +707,11 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, req->rl_sendctx->sc_req = req; __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); - if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen)) return -EIO; if (rtype != rpcrdma_areadch) - if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype)) return -EIO; return 0; @@ -747,8 +747,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) int ret; rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); - xdr_init_encode(xdr, &req->rl_hdrbuf, - req->rl_rdmabuf->rg_base, rqst); + xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), + rqst); /* Fixed header fields */ ret = -EMSGSIZE; @@ -876,6 +876,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) return 0; out_err: + trace_xprtrdma_marshal_failed(rqst, ret); switch (ret) { case -EAGAIN: xprt_wait_for_buffer_space(rqst->rq_xprt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 907464c2a9f0..bed57d8b5c19 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -261,7 +261,7 @@ static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .buf_alloc = xprt_rdma_bc_allocate, .buf_free = xprt_rdma_bc_free, .send_request = 
xprt_rdma_bc_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xprt_rdma_bc_close, .destroy = xprt_rdma_bc_put, .print_stats = xprt_rdma_print_stats diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 5d261353bd90..1f73a6a7e43c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -68,9 +68,9 @@ * tunables */ -static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; -static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; @@ -288,7 +288,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_destroy(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -311,10 +311,8 @@ static const struct rpc_timeout xprt_rdma_default_timeout = { static struct rpc_xprt * xprt_setup_rdma(struct xprt_create *args) { - struct rpcrdma_create_data_internal cdata; struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; - struct rpcrdma_ep *new_ep; struct sockaddr *sap; int rc; @@ -349,40 +347,12 @@ xprt_setup_rdma(struct xprt_create *args) xprt_set_bound(xprt); xprt_rdma_format_addresses(xprt, sap); - cdata.max_requests = xprt_rdma_slot_table_entries; - - cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ - cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ - - cdata.inline_wsize = xprt_rdma_max_inline_write; - if (cdata.inline_wsize > cdata.wsize) - cdata.inline_wsize = cdata.wsize; - - cdata.inline_rsize = xprt_rdma_max_inline_read; - if (cdata.inline_rsize > cdata.rsize) - cdata.inline_rsize = cdata.rsize; - - /* - * Create new transport instance, which includes initialized - * o ia - * o endpoint - * o buffers - */ - new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); if (rc) goto out1; - /* - * initialize and create ep - */ - new_xprt->rx_data = cdata; - new_ep = &new_xprt->rx_ep; - - rc = rpcrdma_ep_create(&new_xprt->rx_ep, - &new_xprt->rx_ia, &new_xprt->rx_data); + rc = rpcrdma_ep_create(new_xprt); if (rc) goto out2; @@ -413,7 +383,7 @@ out4: rpcrdma_buffer_destroy(&new_xprt->rx_buf); rc = -ENODEV; out3: - rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); + rpcrdma_ep_destroy(new_xprt); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: @@ -585,52 +555,15 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) rpc_wake_up_next(&xprt->backlog); } -static bool -rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - size_t size, gfp_t flags) +static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb, size_t size, + gfp_t flags) { - struct rpcrdma_regbuf *rb; - - if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); - if (IS_ERR(rb)) - return false; - - rpcrdma_free_regbuf(req->rl_sendbuf); - r_xprt->rx_stats.hardway_register_count += size; - req->rl_sendbuf = rb; - return true; -} - -/* The rq_rcv_buf is used only if a Reply chunk is necessary. - * The decision to use a Reply chunk is made later in - * rpcrdma_marshal_req. 
This buffer is registered at that time. - * - * Otherwise, the associated RPC Reply arrives in a separate - * Receive buffer, arbitrarily chosen by the HCA. The buffer - * allocated here for the RPC Reply is not utilized in that - * case. See rpcrdma_inline_fixup. - * - * A regbuf is used here to remember the buffer size. - */ -static bool -rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - size_t size, gfp_t flags) -{ - struct rpcrdma_regbuf *rb; - - if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); - if (IS_ERR(rb)) - return false; - - rpcrdma_free_regbuf(req->rl_recvbuf); - r_xprt->rx_stats.hardway_register_count += size; - req->rl_recvbuf = rb; + if (unlikely(rdmab_length(rb) < size)) { + if (!rpcrdma_regbuf_realloc(rb, size, flags)) + return false; + r_xprt->rx_stats.hardway_register_count += size; + } return true; } @@ -655,13 +588,15 @@ xprt_rdma_allocate(struct rpc_task *task) if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize, + flags)) goto out_fail; - if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) + if (!rpcrdma_check_regbuf(r_xprt, req->rl_recvbuf, rqst->rq_rcvsize, + flags)) goto out_fail; - rqst->rq_buffer = req->rl_sendbuf->rg_base; - rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + rqst->rq_buffer = rdmab_data(req->rl_sendbuf); + rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf); trace_xprtrdma_op_allocate(task, req); return 0; @@ -815,7 +750,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .alloc_slot = xprt_rdma_alloc_slot, .free_slot = xprt_rdma_free_slot, .release_request = xprt_release_rqst_cong, /* ditto */ - .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .wait_for_reply_request = xprt_wait_for_reply_request_def, /* ditto */ .timer = xprt_rdma_timer, .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ .set_port = xprt_rdma_set_port, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 89a63391d4d4..bef5eac8ab38 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -76,11 +76,16 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); -static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp); -static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, + gfp_t flags); +static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); +static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); -/* Wait for outstanding transport work to finish. +/* Wait for outstanding transport work to finish. ib_drain_qp + * handles the drains in the wrong order for us, so open code + * them here. */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { @@ -90,7 +95,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) /* Flush Receives, then wait for deferred Reply work * to complete. 
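/*
 * The ordering below is deliberate: ib_drain_rq() flushes Receives
 * first so that no new Reply work can be queued, and only then is the
 * completion workqueue drained. ib_drain_qp() would drain the send
 * queue before the receive queue, which is the "wrong order" the
 * comment above refers to.
 */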
*/ - ib_drain_qp(ia->ri_id->qp); + ib_drain_rq(ia->ri_id->qp); drain_workqueue(buf->rb_completion_wq); /* Deferred Reply processing might have scheduled @@ -132,11 +137,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); - if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("rpcrdma: Send: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - rpcrdma_sendctx_put_locked(sc); } @@ -174,10 +174,6 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); rpcrdma_recv_buffer_put(rep); } @@ -185,7 +181,6 @@ static void rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, struct rdma_conn_param *param) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; const struct rpcrdma_connect_private *pmsg = param->private_data; unsigned int rsize, wsize; @@ -202,12 +197,13 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < cdata->inline_rsize) - cdata->inline_rsize = rsize; - if (wsize < cdata->inline_wsize) - cdata->inline_wsize = wsize; - dprintk("RPC: %s: max send %u, max recv %u\n", - __func__, cdata->inline_wsize, cdata->inline_rsize); + if (rsize < r_xprt->rx_ep.rep_inline_recv) + r_xprt->rx_ep.rep_inline_recv = rsize; + if (wsize < r_xprt->rx_ep.rep_inline_send) + r_xprt->rx_ep.rep_inline_send = wsize; + dprintk("RPC: %s: max send %u, max recv %u\n", __func__, + r_xprt->rx_ep.rep_inline_send, + r_xprt->rx_ep.rep_inline_recv); rpcrdma_set_max_header_sizes(r_xprt); } @@ -247,7 +243,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_device->name, + ia->ri_id->device->name, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); @@ -256,7 +252,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; - ia->ri_device = NULL; /* Return 1 to ensure the core destroys the id. 
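/*
 * The return value matters here: returning nonzero from an rdma_cm
 * event handler asks the CM core to destroy the rdma_cm_id on the
 * handler's behalf. With ri_device removed, ri_id is the sole
 * remaining handle, so the DEVICE_REMOVAL path clears it and lets the
 * core perform the teardown.
 */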
*/ return 1; case RDMA_CM_EVENT_ESTABLISHED: @@ -291,7 +286,7 @@ disconnected: dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_device->name, rdma_event_msg(event->event)); + ia->ri_id->device->name, rdma_event_msg(event->event)); return 0; } @@ -370,9 +365,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) rc = PTR_ERR(ia->ri_id); goto out_err; } - ia->ri_device = ia->ri_id->device; - ia->ri_pd = ib_alloc_pd(ia->ri_device, 0); + ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); @@ -381,12 +375,12 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRWR: - if (frwr_is_supported(ia)) + if (frwr_is_supported(ia->ri_id->device)) break; /*FALLTHROUGH*/ default: pr_err("rpcrdma: Device %s does not support memreg mode %d\n", - ia->ri_device->name, xprt_rdma_memreg_strategy); + ia->ri_id->device->name, xprt_rdma_memreg_strategy); rc = -EINVAL; goto out_err; } @@ -438,11 +432,11 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) * mappings and MRs are gone. */ list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) - rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); list_for_each_entry(req, &buf->rb_allreqs, rl_all) { - rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); - rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); - rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); + rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); + rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); } rpcrdma_mrs_destroy(buf); ib_dealloc_pd(ia->ri_pd); @@ -468,7 +462,6 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) rdma_destroy_id(ia->ri_id); } ia->ri_id = NULL; - ia->ri_device = NULL; /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) @@ -476,19 +469,26 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) ia->ri_pd = NULL; } -/* - * Create unconnected endpoint. +/** + * rpcrdma_ep_create - Create unconnected endpoint + * @r_xprt: transport to instantiate + * + * Returns zero on success, or a negative errno. 
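/*
 * With rpcrdma_ep_create() now taking the whole r_xprt, transport
 * setup reduces to the sequence below (a sketch with the error
 * unwinding trimmed; xprt_setup_rdma() above is the full version):
 */
static int example_rdma_setup(struct rpcrdma_xprt *r_xprt)
{
	int rc;

	rc = rpcrdma_ia_open(r_xprt);	/* cm_id and protection domain */
	if (rc)
		return rc;
	rc = rpcrdma_ep_create(r_xprt);	/* CQs and connection parameters */
	if (rc)
		rpcrdma_ia_close(&r_xprt->rx_ia);
	return rc;
}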
*/ -int -rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata) +int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; struct ib_cq *sendcq, *recvcq; unsigned int max_sge; int rc; - max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge, + ep->rep_max_requests = xprt_rdma_slot_table_entries; + ep->rep_inline_send = xprt_rdma_max_inline_write; + ep->rep_inline_recv = xprt_rdma_max_inline_read; + + max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge, RPCRDMA_MAX_SEND_SGES); if (max_sge < RPCRDMA_MIN_SEND_SGES) { pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); @@ -496,7 +496,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } ia->ri_max_send_sges = max_sge; - rc = frwr_open(ia, ep, cdata); + rc = frwr_open(ia, ep); if (rc) return rc; @@ -518,23 +518,21 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_send_sge, ep->rep_attr.cap.max_recv_sge); - /* set trigger for requesting send completion */ - ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, - cdata->max_requests >> 2); + ep->rep_send_batch = ep->rep_max_requests >> 3; ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; - sendcq = ib_alloc_cq(ia->ri_device, NULL, + sendcq = ib_alloc_cq(ia->ri_id->device, NULL, ep->rep_attr.cap.max_send_wr + 1, - ia->ri_device->num_comp_vectors > 1 ? 1 : 0, + ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); goto out1; } - recvcq = ib_alloc_cq(ia->ri_device, NULL, + recvcq = ib_alloc_cq(ia->ri_id->device, NULL, ep->rep_attr.cap.max_recv_wr + 1, 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { @@ -552,15 +550,15 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); ep->rep_remote_cma.private_data = pmsg; ep->rep_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.responder_resources = - min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); + min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing @@ -583,16 +581,16 @@ out1: return rc; } -/* - * rpcrdma_ep_destroy +/** + * rpcrdma_ep_destroy - Disconnect and destroy endpoint. + * @r_xprt: transport instance to shut down * - * Disconnect and destroy endpoint. After this, the only - * valid operations on the ep are to free it (if dynamically - * allocated) or re-create it. 
*/ -void -rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); @@ -622,7 +620,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, goto out1; rc = -ENOMEM; - err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + err = rpcrdma_ep_create(r_xprt); if (err) { pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); goto out2; @@ -639,7 +637,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, return 0; out3: - rpcrdma_ep_destroy(ep, ia); + rpcrdma_ep_destroy(r_xprt); out2: rpcrdma_ia_close(ia); out1: @@ -672,7 +670,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, */ old = id; rc = -ENETUNREACH; - if (ia->ri_device != id->device) { + if (ia->ri_id->device != id->device) { pr_err("rpcrdma: can't reconnect on different device!\n"); goto out_destroy; } @@ -796,8 +794,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) */ /* rpcrdma_sendctxs_destroy() assumes caller has already quiesced - * queue activity, and ib_drain_qp has flushed all remaining Send - * requests. + * queue activity, and rpcrdma_xprt_drain has flushed all remaining + * Send requests. */ static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) { @@ -867,20 +865,20 @@ static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, /** * rpcrdma_sendctx_get_locked - Acquire a send context - * @buf: transport buffers from which to acquire an unused context + * @r_xprt: controlling transport instance * * Returns pointer to a free send completion context; or NULL if * the queue is empty. * * Usage: Called to acquire an SGE array before preparing a Send WR. * - * The caller serializes calls to this function (per rpcrdma_buffer), - * and provides an effective memory barrier that flushes the new value + * The caller serializes calls to this function (per transport), and + * provides an effective memory barrier that flushes the new value * of rb_sc_head. */ -struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_sendctx *sc; unsigned long next_head; @@ -905,7 +903,6 @@ out_emptyq: * backing up. Cause the caller to pause and try again. */ set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags); - r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); r_xprt->rx_stats.empty_sendctx_q++; return NULL; } @@ -917,7 +914,7 @@ out_emptyq: * Usage: Called from Send completion to return a sendctxt * to the queue. * - * The caller serializes calls to this function (per rpcrdma_buffer). + * The caller serializes calls to this function (per transport). */ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) @@ -925,7 +922,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; unsigned long next_tail; - /* Unmap SGEs of previously completed by unsignaled + /* Unmap SGEs of previously completed but unsignaled * Sends by walking up the queue until @sc is found. 
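The send-context hunks here (rpcrdma_sendctx_get_locked() above, rpcrdma_sendctx_put_locked() just below) operate on one circular queue of send contexts: the sender takes contexts at the head, and a signaled Send completion walks the tail forward past the completed-but-unsignaled entries. A hypothetical userspace model of that ring; sc_ring, sc_get() and sc_put() are invented names, and the kernel's memory-barrier and locking discipline is deliberately omitted:

#include <stdio.h>

#define RING_SIZE 8

struct sc_ring {
	unsigned long head;	/* advanced by the sender */
	unsigned long tail;	/* advanced by Send completion */
	int ctx[RING_SIZE];
};

static unsigned long ring_next(unsigned long i)
{
	return i + 1 < RING_SIZE ? i + 1 : 0;
}

/* Acquire an unused context, or NULL when the ring is exhausted. */
static int *sc_get(struct sc_ring *r)
{
	unsigned long next = ring_next(r->head);

	if (next == r->tail)
		return NULL;
	r->head = next;
	return &r->ctx[next];
}

/* One signaled completion retires every unsignaled context before it. */
static void sc_put(struct sc_ring *r, int *signaled)
{
	unsigned long next = r->tail;

	do {
		next = ring_next(next);
		r->ctx[next] = 0;	/* stands in for unmapping SGEs */
	} while (&r->ctx[next] != signaled);
	r->tail = next;
}

int main(void)
{
	struct sc_ring r = { 0 };
	int *a = sc_get(&r);
	int *b = sc_get(&r);
	int *c = sc_get(&r);

	(void)a; (void)b;
	sc_put(&r, c);		/* retires a and b along the way */
	printf("head=%lu tail=%lu\n", r.head, r.tail);
	return 0;
}

Letting the tail advance only when a completion actually fires is what makes unsignaled Sends cheap: the provider interrupts once per batch, yet every intermediate context is still reclaimed.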
*/ next_tail = buf->rb_sc_tail; @@ -933,7 +930,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) next_tail = rpcrdma_sendctx_next(buf, next_tail); /* ORDER: item must be accessed _before_ tail is updated */ - rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + rpcrdma_sendctx_unmap(buf->rb_sc_ctxs[next_tail]); } while (buf->rb_sc_ctxs[next_tail] != sc); @@ -996,54 +993,70 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) rpcrdma_mrs_create(r_xprt); } -struct rpcrdma_req * -rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_req_create - Allocate an rpcrdma_req object + * @r_xprt: controlling r_xprt + * @size: initial size, in bytes, of send and receive buffers + * @flags: GFP flags passed to memory allocators + * + * Returns an allocated and fully initialized rpcrdma_req or NULL. + */ +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, + gfp_t flags) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), GFP_KERNEL); + req = kzalloc(sizeof(*req), flags); if (req == NULL) - return ERR_PTR(-ENOMEM); + goto out1; - rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, - DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) { - kfree(req); - return ERR_PTR(-ENOMEM); - } + rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); + if (!rb) + goto out2; req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); + + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); + if (!req->rl_sendbuf) + goto out3; + + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); + if (!req->rl_recvbuf) + goto out4; + req->rl_buffer = buffer; INIT_LIST_HEAD(&req->rl_registered); - spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_lock); return req; + +out4: + kfree(req->rl_sendbuf); +out3: + kfree(req->rl_rdmabuf); +out2: + kfree(req); +out1: + return NULL; } -static int -rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) +static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - int rc; - rc = -ENOMEM; rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); - if (IS_ERR(rep->rr_rdmabuf)) { - rc = PTR_ERR(rep->rr_rdmabuf); + if (!rep->rr_rdmabuf) goto out_free; - } - xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1058,22 +1071,27 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) spin_lock(&buf->rb_lock); list_add(&rep->rr_list, &buf->rb_recv_bufs); spin_unlock(&buf->rb_lock); - return 0; + return true; out_free: kfree(rep); out: - return rc; + return false; } -int -rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) +/** + * rpcrdma_buffer_create - Create initial set of req/rep objects + * @r_xprt: transport instance to (re)initialize + * + * Returns zero on success, otherwise a negative errno. 
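rpcrdma_req_create() above uses the kernel's ladder of goto labels so that each allocation has exactly one unwind path and every partial allocation is released exactly once. A compact, runnable userspace sketch of the same shape, using plain malloc()/free() and invented member names in place of the regbuf helpers:

#include <stdlib.h>

struct req {
	void *hdrbuf;
	void *sendbuf;
	void *recvbuf;
};

static struct req *req_create(size_t size)
{
	struct req *req;

	req = calloc(1, sizeof(*req));
	if (!req)
		goto out1;
	req->hdrbuf = malloc(128);
	if (!req->hdrbuf)
		goto out2;
	req->sendbuf = malloc(size);
	if (!req->sendbuf)
		goto out3;
	req->recvbuf = malloc(size);
	if (!req->recvbuf)
		goto out4;
	return req;

out4:				/* labels unwind in reverse order */
	free(req->sendbuf);
out3:
	free(req->hdrbuf);
out2:
	free(req);
out1:
	return NULL;
}

int main(void)
{
	struct req *req = req_create(1024);

	if (!req)
		return 1;
	free(req->recvbuf);
	free(req->sendbuf);
	free(req->hdrbuf);
	free(req);
	return 0;
}

Returning NULL instead of an ERR_PTR, as the new code does, suits callers that only need a succeeded/failed answer and removes the IS_ERR()/PTR_ERR() dance at every call site.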
+ */ +int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; buf->rb_flags = 0; - buf->rb_max_requests = r_xprt->rx_data.max_requests; + buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); @@ -1086,16 +1104,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); + + rc = -ENOMEM; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - req = rpcrdma_create_req(r_xprt); - if (IS_ERR(req)) { - dprintk("RPC: %s: request buffer %d alloc" - " failed\n", __func__, i); - rc = PTR_ERR(req); + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, + GFP_KERNEL); + if (!req) goto out; - } list_add(&req->rl_list, &buf->rb_send_bufs); } @@ -1121,10 +1138,9 @@ out: return rc; } -static void -rpcrdma_destroy_rep(struct rpcrdma_rep *rep) +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) { - rpcrdma_free_regbuf(rep->rr_rdmabuf); + rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } @@ -1140,9 +1156,9 @@ rpcrdma_req_destroy(struct rpcrdma_req *req) { list_del(&req->rl_all); - rpcrdma_free_regbuf(req->rl_recvbuf); - rpcrdma_free_regbuf(req->rl_sendbuf); - rpcrdma_free_regbuf(req->rl_rdmabuf); + rpcrdma_regbuf_free(req->rl_recvbuf); + rpcrdma_regbuf_free(req->rl_sendbuf); + rpcrdma_regbuf_free(req->rl_rdmabuf); kfree(req); } @@ -1180,7 +1196,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) * rpcrdma_buffer_destroy - Release all hw resources * @buf: root control block for resources * - * ORDERING: relies on a prior ib_drain_qp : + * ORDERING: relies on a prior rpcrdma_xprt_drain : * - No more Send or Receive completions can occur * - All MRs, reps, and reqs are returned to their free lists */ @@ -1202,7 +1218,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rep = list_first_entry(&buf->rb_recv_bufs, struct rpcrdma_rep, rr_list); list_del(&rep->rr_list); - rpcrdma_destroy_rep(rep); + rpcrdma_rep_destroy(rep); } while (!list_empty(&buf->rb_send_bufs)) { @@ -1281,7 +1297,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1331,7 +1347,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) } spin_unlock(&buffers->rb_lock); if (rep) - rpcrdma_destroy_rep(rep); + rpcrdma_rep_destroy(rep); } /* @@ -1348,69 +1364,90 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) list_add(&rep->rr_list, &buffers->rb_recv_bufs); spin_unlock(&buffers->rb_lock); } else { - rpcrdma_destroy_rep(rep); + rpcrdma_rep_destroy(rep); } } -/** - * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers - * @size: size of buffer to be allocated, in bytes - * @direction: direction of data movement - * @flags: GFP flags - * - * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that - * can be persistently DMA-mapped for I/O. +/* Returns a pointer to a rpcrdma_regbuf object, or NULL. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for * receiving the payload of RDMA RECV operations. During Long Calls * or Replies they may be registered externally via frwr_map. 
*/ -struct rpcrdma_regbuf * -rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, +static struct rpcrdma_regbuf * +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb) + size, flags); - if (rb == NULL) - return ERR_PTR(-ENOMEM); + rb = kmalloc(sizeof(*rb), flags); + if (!rb) + return NULL; + rb->rg_data = kmalloc(size, flags); + if (!rb->rg_data) { + kfree(rb); + return NULL; + } rb->rg_device = NULL; rb->rg_direction = direction; rb->rg_iov.length = size; - return rb; } /** - * __rpcrdma_map_regbuf - DMA-map a regbuf - * @ia: controlling rpcrdma_ia + * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer + * @rb: regbuf to reallocate + * @size: size of buffer to be allocated, in bytes + * @flags: GFP flags + * + * Returns true if reallocation was successful. If false is + * returned, @rb is left untouched. + */ +bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) +{ + void *buf; + + buf = kmalloc(size, flags); + if (!buf) + return false; + + rpcrdma_regbuf_dma_unmap(rb); + kfree(rb->rg_data); + + rb->rg_data = buf; + rb->rg_iov.length = size; + return true; +} + +/** + * __rpcrdma_regbuf_dma_map - DMA-map a regbuf + * @r_xprt: controlling transport instance * @rb: regbuf to be mapped + * + * Returns true if the buffer is now DMA mapped to @r_xprt's device */ -bool -__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb) { - struct ib_device *device = ia->ri_device; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; if (rb->rg_direction == DMA_NONE) return false; - rb->rg_iov.addr = ib_dma_map_single(device, - (void *)rb->rg_base, - rdmab_length(rb), - rb->rg_direction); + rb->rg_iov.addr = ib_dma_map_single(device, rdmab_data(rb), + rdmab_length(rb), rb->rg_direction); if (ib_dma_mapping_error(device, rdmab_addr(rb))) { trace_xprtrdma_dma_maperr(rdmab_addr(rb)); return false; } rb->rg_device = device; - rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; return true; } -static void -rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) +static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb) { if (!rb) return; @@ -1418,19 +1455,16 @@ rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) if (!rpcrdma_regbuf_is_mapped(rb)) return; - ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), - rdmab_length(rb), rb->rg_direction); + ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), rdmab_length(rb), + rb->rg_direction); rb->rg_device = NULL; } -/** - * rpcrdma_free_regbuf - deregister and free registered buffer - * @rb: regbuf to be deregistered and freed - */ -void -rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) +static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) { - rpcrdma_dma_unmap_regbuf(rb); + rpcrdma_regbuf_dma_unmap(rb); + if (rb) + kfree(rb->rg_data); kfree(rb); } @@ -1497,17 +1531,15 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) list_del(&rep->rr_list); spin_unlock(&buf->rb_lock); if (!rep) { - if (rpcrdma_create_rep(r_xprt, temp)) + if (!rpcrdma_rep_create(r_xprt, temp)) break; continue; } rb = rep->rr_rdmabuf; - if (!rpcrdma_regbuf_is_mapped(rb)) { - if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) { - rpcrdma_recv_buffer_put(rep); - break; - } + if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) { + rpcrdma_recv_buffer_put(rep); + break; } 
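The regbuf rework above detaches the payload (rg_data) from the metadata, so rpcrdma_regbuf_realloc() can swap just the data pointer while the regbuf itself, and every pointer to it, stays valid; on allocation failure the old buffer is left untouched. A runnable sketch of that contract with illustrative names; the DMA unmapping step is omitted, and note that, like the kernel version, old contents are discarded rather than copied:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct regbuf {
	size_t len;
	void *data;	/* was a trailing flexible array in the old layout */
};

static bool regbuf_realloc(struct regbuf *rb, size_t size)
{
	void *buf = malloc(size);

	if (!buf)
		return false;	/* rb remains usable on failure */
	free(rb->data);
	rb->data = buf;
	rb->len = size;
	return true;
}

int main(void)
{
	struct regbuf rb = { .len = 64, .data = malloc(64) };

	if (!rb.data)
		return 1;
	if (regbuf_realloc(&rb, 4096))
		memset(rb.data, 0, rb.len);
	free(rb.data);
	return 0;
}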
trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 10f6593e1a6a..d1e0749bcbc4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -66,20 +66,17 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { - struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct completion ri_done; - struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; unsigned int ri_max_frwr_depth; - unsigned int ri_max_inline_write; - unsigned int ri_max_inline_read; unsigned int ri_max_send_sges; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; + struct completion ri_done; + struct completion ri_remove_done; }; enum { @@ -93,22 +90,29 @@ enum { struct rpcrdma_ep { unsigned int rep_send_count; unsigned int rep_send_batch; + unsigned int rep_max_inline_send; + unsigned int rep_max_inline_recv; int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; + unsigned int rep_max_requests; /* set by /proc */ + unsigned int rep_inline_send; /* negotiated */ + unsigned int rep_inline_recv; /* negotiated */ int rep_receive_count; }; /* Pre-allocate extra Work Requests for handling backward receives * and sends. This is a fixed value because the Work Queues are - * allocated when the forward channel is set up. + * allocated when the forward channel is set up, long before the + * backchannel is provisioned. This value is two times + * NFS4_DEF_CB_SLOT_TABLE_SIZE. */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) -#define RPCRDMA_BACKWARD_WRS (8) +#define RPCRDMA_BACKWARD_WRS (32) #else -#define RPCRDMA_BACKWARD_WRS (0) +#define RPCRDMA_BACKWARD_WRS (0) #endif /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV @@ -121,33 +125,34 @@ struct rpcrdma_regbuf { struct ib_sge rg_iov; struct ib_device *rg_device; enum dma_data_direction rg_direction; - __be32 rg_base[0] __attribute__ ((aligned(256))); + void *rg_data; }; -static inline u64 -rdmab_addr(struct rpcrdma_regbuf *rb) +static inline u64 rdmab_addr(struct rpcrdma_regbuf *rb) { return rb->rg_iov.addr; } -static inline u32 -rdmab_length(struct rpcrdma_regbuf *rb) +static inline u32 rdmab_length(struct rpcrdma_regbuf *rb) { return rb->rg_iov.length; } -static inline u32 -rdmab_lkey(struct rpcrdma_regbuf *rb) +static inline u32 rdmab_lkey(struct rpcrdma_regbuf *rb) { return rb->rg_iov.lkey; } -static inline struct ib_device * -rdmab_device(struct rpcrdma_regbuf *rb) +static inline struct ib_device *rdmab_device(struct rpcrdma_regbuf *rb) { return rb->rg_device; } +static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) +{ + return rb->rg_data; +} + #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) /* To ensure a transport can always make forward progress, @@ -222,34 +227,18 @@ struct rpcrdma_xprt; struct rpcrdma_sendctx { struct ib_send_wr sc_wr; struct ib_cqe sc_cqe; + struct ib_device *sc_device; struct rpcrdma_xprt *sc_xprt; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; }; -/* Limit the number of SGEs that can be unmapped during one - * Send completion. This caps the amount of work a single - * completion can do before returning to the provider. - * - * Setting this to zero disables Send completion batching. 
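The removed RPCRDMA_MAX_SEND_BATCH comment above described Send completion batching: only every Nth Send WR is signaled (N is rep_send_batch, now derived as rep_max_requests >> 3 in the verbs.c hunk earlier), so a single completion retires an entire batch of contexts. A runnable toy showing just the countdown; struct ep and post_send() are invented names:

#include <stdio.h>

struct ep {
	unsigned int send_batch;
	unsigned int send_count;
};

/* Request a completion only when the countdown reaches zero. */
static void post_send(struct ep *ep, int wr)
{
	if (--ep->send_count) {
		printf("WR %2d: unsignaled\n", wr);
	} else {
		printf("WR %2d: signaled\n", wr);
		ep->send_count = ep->send_batch;	/* next batch */
	}
}

int main(void)
{
	struct ep ep = { .send_batch = 4, .send_count = 4 };
	int wr;

	for (wr = 1; wr <= 10; wr++)
		post_send(&ep, wr);
	return 0;
}

The trade-off is interrupt rate against reclamation latency: a larger batch means fewer completions, but send contexts and their DMA mappings are held longer before the one signaled completion reclaims them.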
- */ -enum { - RPCRDMA_MAX_SEND_BATCH = 7, -}; - /* * struct rpcrdma_mr - external memory region metadata * * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). - * - * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During - * call_allocate, rpcrdma_buffer_get() assigns one to each segment in - * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep - * track of registration metadata while each RPC is pending. - * rpcrdma_deregister_external() uses this metadata to unmap and - * release these resources when an RPC is complete. */ enum rpcrdma_frwr_state { FRWR_IS_INVALID, /* ready to be used */ @@ -419,20 +408,6 @@ enum { }; /* - * Internal structure for transport instance creation. This - * exists primarily for modularity. - * - * This data should be set with mount options - */ -struct rpcrdma_create_data_internal { - unsigned int max_requests; /* max requests (slots) in flight */ - unsigned int rsize; /* mount rsize - max read hdr+data */ - unsigned int wsize; /* mount wsize - max write hdr+data */ - unsigned int inline_rsize; /* max non-rdma read data payload */ - unsigned int inline_wsize; /* max non-rdma write data payload */ -}; - -/* * Statistics for RPCRDMA */ struct rpcrdma_stats { @@ -476,13 +451,11 @@ struct rpcrdma_xprt { struct rpcrdma_ia rx_ia; struct rpcrdma_ep rx_ep; struct rpcrdma_buffer rx_buf; - struct rpcrdma_create_data_internal rx_data; struct delayed_work rx_connect_worker; struct rpcrdma_stats rx_stats; }; #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) -#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) static inline const char * rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) @@ -516,9 +489,8 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *); -void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt); +void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); @@ -528,11 +500,12 @@ int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, + gfp_t flags); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); -struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); @@ -548,23 +521,34 @@ struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, - gfp_t); -bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); -void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); +bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, + gfp_t flags); +bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf 
*rb); -static inline bool -rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) +/** + * rpcrdma_regbuf_is_mapped - check if buffer is DMA mapped + * + * Returns true if the buffer is now mapped to rb->rg_device. + */ +static inline bool rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) { return rb->rg_device != NULL; } -static inline bool -rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +/** + * rpcrdma_regbuf_dma_map - DMA-map a regbuf + * @r_xprt: controlling transport instance + * @rb: regbuf to be mapped + * + * Returns true if the buffer is currently DMA mapped. + */ +static inline bool rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb) { if (likely(rpcrdma_regbuf_is_mapped(rb))) return true; - return __rpcrdma_dma_map_regbuf(ia, rb); + return __rpcrdma_regbuf_dma_map(r_xprt, rb); } /* @@ -579,9 +563,8 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ -bool frwr_is_supported(struct rpcrdma_ia *); -int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata); +bool frwr_is_supported(struct ib_device *device); +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); @@ -610,7 +593,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, u32 hdrlen, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype); -void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc); +void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); @@ -627,7 +610,9 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_slot_table_entries; extern unsigned int xprt_rdma_max_inline_read; +extern unsigned int xprt_rdma_max_inline_write; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); void xprt_rdma_close(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 732d4b57411a..c69951ed2ebc 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2017,6 +2017,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) * we'll need to figure out how to pass a namespace to * connect. 
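The rpcrdma_regbuf_dma_map() inline above keeps the per-I/O fast path to a single pointer test: a persistently mapped buffer skips the __rpcrdma_regbuf_dma_map() slow path entirely. A userspace sketch of that memoized-mapping shape, with stand-in names and a stub in place of ib_dma_map_single():

#include <stdbool.h>
#include <stdio.h>

struct regbuf {
	const void *device;	/* non-NULL once mapped */
};

/* Slow path: stands in for ib_dma_map_single() plus its error check. */
static bool __regbuf_dma_map(const void *device, struct regbuf *rb)
{
	rb->device = device;
	return true;
}

/* Fast path: an already-mapped buffer costs one test per I/O. */
static bool regbuf_dma_map(const void *device, struct regbuf *rb)
{
	if (rb->device)
		return true;
	return __regbuf_dma_map(device, rb);
}

int main(void)
{
	static const char hca[] = "example-hca";
	struct regbuf rb = { 0 };

	regbuf_dma_map(hca, &rb);	/* maps on first use */
	regbuf_dma_map(hca, &rb);	/* no-op thereafter */
	printf("mapped to %s\n", (const char *)rb.device);
	return 0;
}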
*/ + task->tk_rpc_status = -ENOTCONN; rpc_exit(task, -ENOTCONN); return; } @@ -2690,7 +2691,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_close, .destroy = xs_destroy, .print_stats = xs_local_print_stats, @@ -2710,7 +2711,7 @@ static const struct rpc_xprt_ops xs_udp_ops = { .buf_alloc = rpc_malloc, .buf_free = rpc_free, .send_request = xs_udp_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .wait_for_reply_request = xprt_wait_for_reply_request_rtt, .timer = xs_udp_timer, .release_request = xprt_release_rqst_cong, .close = xs_close, @@ -2733,7 +2734,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = xs_tcp_shutdown, .destroy = xs_destroy, .set_connect_timeout = xs_tcp_set_connect_timeout, @@ -2761,7 +2762,7 @@ static const struct rpc_xprt_ops bc_tcp_ops = { .buf_alloc = bc_malloc, .buf_free = bc_free, .send_request = bc_send_request, - .set_retrans_timeout = xprt_set_retrans_timeout_def, + .wait_for_reply_request = xprt_wait_for_reply_request_def, .close = bc_close, .destroy = bc_destroy, .print_stats = xs_tcp_print_stats, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index d8026543bf4c..6c997d4a6218 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -54,7 +54,9 @@ const char tipc_bclink_name[] = "broadcast-link"; * @dests: array keeping number of reachable destinations per bearer * @primary_bearer: a bearer having links to all broadcast destinations, if any * @bcast_support: indicates if primary bearer, if any, supports broadcast + * @force_bcast: forces broadcast for multicast traffic * @rcast_support: indicates if all peer nodes support replicast + * @force_rcast: forces replicast for multicast traffic * @rc_ratio: dest count as percentage of cluster size where send method changes * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast */ @@ -64,7 +66,9 @@ struct tipc_bc_base { int dests[MAX_BEARERS]; int primary_bearer; bool bcast_support; + bool force_bcast; bool rcast_support; + bool force_rcast; int rc_ratio; int bc_threshold; }; @@ -216,9 +220,24 @@ static void tipc_bcast_select_xmit_method(struct net *net, int dests, } /* Can current method be changed ? */ method->expires = jiffies + TIPC_METHOD_EXPIRE; - if (method->mandatory || time_before(jiffies, exp)) + if (method->mandatory) return; + if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL) && + time_before(jiffies, exp)) + return; + + /* Configuration as force 'broadcast' method */ + if (bb->force_bcast) { + method->rcast = false; + return; + } + /* Configuration as force 'replicast' method */ + if (bb->force_rcast) { + method->rcast = true; + return; + } + /* Configuration as 'autoselect' or default method */ /* Determine method to use now */ method->rcast = dests <= bb->bc_threshold; } @@ -281,6 +300,63 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, return 0; } +/* tipc_mcast_send_sync - deliver a dummy message with SYN bit + * @net: the applicable net namespace + * @skb: socket buffer to copy + * @method: send method to be used + * @dests: destination nodes for message. 
+ * @cong_link_cnt: returns number of encountered congested destination links + * Returns 0 on success, otherwise errno + */ +static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, + struct tipc_mc_method *method, + struct tipc_nlist *dests, + u16 *cong_link_cnt) +{ + struct tipc_msg *hdr, *_hdr; + struct sk_buff_head tmpq; + struct sk_buff *_skb; + + /* Does the cluster support the new capability ? */ + if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) + return 0; + + hdr = buf_msg(skb); + if (msg_user(hdr) == MSG_FRAGMENTER) + hdr = msg_get_wrapped(hdr); + if (msg_type(hdr) != TIPC_MCAST_MSG) + return 0; + + /* Allocate dummy message */ + _skb = tipc_buf_acquire(MCAST_H_SIZE, GFP_KERNEL); + if (!_skb) + return -ENOMEM; + + /* Prepare the 'synching' header */ + msg_set_syn(hdr, 1); + + /* Copy skb's header into a dummy header */ + skb_copy_to_linear_data(_skb, hdr, MCAST_H_SIZE); + skb_orphan(_skb); + + /* Reverse method for dummy message */ + _hdr = buf_msg(_skb); + msg_set_size(_hdr, MCAST_H_SIZE); + msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); + + skb_queue_head_init(&tmpq); + __skb_queue_tail(&tmpq, _skb); + if (method->rcast) + tipc_bcast_xmit(net, &tmpq, cong_link_cnt); + else + tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt); + + /* This queue should normally be empty by now */ + __skb_queue_purge(&tmpq); + + return 0; +} + /* tipc_mcast_xmit - deliver message to indicated destination nodes * and to identified node local sockets * @net: the applicable net namespace @@ -296,6 +372,9 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, u16 *cong_link_cnt) { struct sk_buff_head inputq, localq; + bool rcast = method->rcast; + struct tipc_msg *hdr; + struct sk_buff *skb; int rc = 0; skb_queue_head_init(&inputq); @@ -309,6 +388,18 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, /* Send according to determined transmit method */ if (dests->remote) { tipc_bcast_select_xmit_method(net, dests->remote, method); + + skb = skb_peek(pkts); + hdr = buf_msg(skb); + if (msg_user(hdr) == MSG_FRAGMENTER) + hdr = msg_get_wrapped(hdr); + msg_set_is_rcast(hdr, method->rcast); + + /* Switch method ?
*/ + if (rcast != method->rcast) + tipc_mcast_send_sync(net, skb, method, + dests, cong_link_cnt); + if (method->rcast) rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); else @@ -485,10 +576,63 @@ static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) return 0; } +static int tipc_bc_link_set_broadcast_mode(struct net *net, u32 bc_mode) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + + switch (bc_mode) { + case BCLINK_MODE_BCAST: + if (!bb->bcast_support) + return -ENOPROTOOPT; + + bb->force_bcast = true; + bb->force_rcast = false; + break; + case BCLINK_MODE_RCAST: + if (!bb->rcast_support) + return -ENOPROTOOPT; + + bb->force_bcast = false; + bb->force_rcast = true; + break; + case BCLINK_MODE_SEL: + if (!bb->bcast_support || !bb->rcast_support) + return -ENOPROTOOPT; + + bb->force_bcast = false; + bb->force_rcast = false; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int tipc_bc_link_set_broadcast_ratio(struct net *net, u32 bc_ratio) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + + if (!bb->bcast_support || !bb->rcast_support) + return -ENOPROTOOPT; + + if (bc_ratio > 100 || bc_ratio <= 0) + return -EINVAL; + + bb->rc_ratio = bc_ratio; + tipc_bcast_lock(net); + tipc_bcbase_calc_bc_threshold(net); + tipc_bcast_unlock(net); + + return 0; +} + int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) { int err; u32 win; + u32 bc_mode; + u32 bc_ratio; struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; if (!attrs[TIPC_NLA_LINK_PROP]) @@ -498,12 +642,28 @@ int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) if (err) return err; - if (!props[TIPC_NLA_PROP_WIN]) + if (!props[TIPC_NLA_PROP_WIN] && + !props[TIPC_NLA_PROP_BROADCAST] && + !props[TIPC_NLA_PROP_BROADCAST_RATIO]) { return -EOPNOTSUPP; + } + + if (props[TIPC_NLA_PROP_BROADCAST]) { + bc_mode = nla_get_u32(props[TIPC_NLA_PROP_BROADCAST]); + err = tipc_bc_link_set_broadcast_mode(net, bc_mode); + } + + if (!err && props[TIPC_NLA_PROP_BROADCAST_RATIO]) { + bc_ratio = nla_get_u32(props[TIPC_NLA_PROP_BROADCAST_RATIO]); + err = tipc_bc_link_set_broadcast_ratio(net, bc_ratio); + } - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (!err && props[TIPC_NLA_PROP_WIN]) { + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + err = tipc_bc_link_set_queue_limits(net, win); + } - return tipc_bc_link_set_queue_limits(net, win); + return err; } int tipc_bcast_init(struct net *net) @@ -529,7 +689,7 @@ int tipc_bcast_init(struct net *net) goto enomem; bb->link = l; tn->bcl = l; - bb->rc_ratio = 25; + bb->rc_ratio = 10; bb->rcast_support = true; return 0; enomem: @@ -576,3 +736,108 @@ void tipc_nlist_purge(struct tipc_nlist *nl) nl->remote = 0; nl->local = false; } + +u32 tipc_bcast_get_broadcast_mode(struct net *net) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + + if (bb->force_bcast) + return BCLINK_MODE_BCAST; + + if (bb->force_rcast) + return BCLINK_MODE_RCAST; + + if (bb->bcast_support && bb->rcast_support) + return BCLINK_MODE_SEL; + + return 0; +} + +u32 tipc_bcast_get_broadcast_ratio(struct net *net) +{ + struct tipc_bc_base *bb = tipc_bc_base(net); + + return bb->rc_ratio; +} + +void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, + struct sk_buff_head *inputq) +{ + struct sk_buff *skb, *_skb, *tmp; + struct tipc_msg *hdr, *_hdr; + bool match = false; + u32 node, port; + + skb = skb_peek(inputq); + if (!skb) + return; + + hdr = buf_msg(skb); + + if (likely(!msg_is_syn(hdr) && skb_queue_empty(defq))) + return; + + node = msg_orignode(hdr); + if (node == 
tipc_own_addr(net)) + return; + + port = msg_origport(hdr); + + /* Has the twin SYN message already arrived ? */ + skb_queue_walk(defq, _skb) { + _hdr = buf_msg(_skb); + if (msg_orignode(_hdr) != node) + continue; + if (msg_origport(_hdr) != port) + continue; + match = true; + break; + } + + if (!match) { + if (!msg_is_syn(hdr)) + return; + __skb_dequeue(inputq); + __skb_queue_tail(defq, skb); + return; + } + + /* Deliver non-SYN message from other link, otherwise queue it */ + if (!msg_is_syn(hdr)) { + if (msg_is_rcast(hdr) != msg_is_rcast(_hdr)) + return; + __skb_dequeue(inputq); + __skb_queue_tail(defq, skb); + return; + } + + /* Queue non-SYN/SYN message from same link */ + if (msg_is_rcast(hdr) == msg_is_rcast(_hdr)) { + __skb_dequeue(inputq); + __skb_queue_tail(defq, skb); + return; + } + + /* Matching SYN messages => return the one with data, if any */ + __skb_unlink(_skb, defq); + if (msg_data_sz(hdr)) { + kfree_skb(_skb); + } else { + __skb_dequeue(inputq); + kfree_skb(skb); + __skb_queue_tail(inputq, _skb); + } + + /* Deliver subsequent non-SYN messages from same peer */ + skb_queue_walk_safe(defq, _skb, tmp) { + _hdr = buf_msg(_skb); + if (msg_orignode(_hdr) != node) + continue; + if (msg_origport(_hdr) != port) + continue; + if (msg_is_syn(_hdr)) + break; + __skb_unlink(_skb, defq); + __skb_queue_tail(inputq, _skb); + } +} diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 751530ab0c49..dadad953e2be 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -48,6 +48,10 @@ extern const char tipc_bclink_name[]; #define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000) +#define BCLINK_MODE_BCAST 0x1 +#define BCLINK_MODE_RCAST 0x2 +#define BCLINK_MODE_SEL 0x4 + struct tipc_nlist { struct list_head list; u32 self; @@ -63,11 +67,13 @@ void tipc_nlist_del(struct tipc_nlist *nl, u32 node); /* Cookie to be used between socket and broadcast layer * @rcast: replicast (instead of broadcast) was used at previous xmit * @mandatory: broadcast/replicast indication was set by user + * @deferredq: defer queue to make message in order * @expires: re-evaluate non-mandatory transmit method if we are past this */ struct tipc_mc_method { bool rcast; bool mandatory; + struct sk_buff_head deferredq; unsigned long expires; }; @@ -92,6 +98,12 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); int tipc_bclink_reset_stats(struct net *net); +u32 tipc_bcast_get_broadcast_mode(struct net *net); +u32 tipc_bcast_get_broadcast_ratio(struct net *net); + +void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, + struct sk_buff_head *inputq); + static inline void tipc_bcast_lock(struct net *net) { spin_lock_bh(&tipc_net(net)->bclock); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d27f30a9a01d..2bed6589f41e 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -687,14 +687,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_BEARER); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER); if (!attrs) goto msg_full; if (nla_put_string(msg->skb, TIPC_NLA_BEARER_NAME, bearer->name)) goto attr_msg_full; - prop = nla_nest_start(msg->skb, TIPC_NLA_BEARER_PROP); + prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_PROP); if (!prop) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, bearer->priority)) @@ -776,9 +776,9 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) if 
(!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, - info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -825,9 +825,9 @@ int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, - info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -870,9 +870,9 @@ int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, - info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -921,9 +921,9 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, - info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -964,9 +964,9 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, - info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -1033,14 +1033,14 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_MEDIA); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MEDIA); if (!attrs) goto msg_full; if (nla_put_string(msg->skb, TIPC_NLA_MEDIA_NAME, media->name)) goto attr_msg_full; - prop = nla_nest_start(msg->skb, TIPC_NLA_MEDIA_PROP); + prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_MEDIA_PROP); if (!prop) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, media->priority)) @@ -1107,9 +1107,9 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_MEDIA]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX, - info->attrs[TIPC_NLA_MEDIA], - tipc_nl_media_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MEDIA_MAX, + info->attrs[TIPC_NLA_MEDIA], + tipc_nl_media_policy, info->extack); if (err) return err; @@ -1155,9 +1155,9 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_MEDIA]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX, - info->attrs[TIPC_NLA_MEDIA], - tipc_nl_media_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MEDIA_MAX, + info->attrs[TIPC_NLA_MEDIA], + tipc_nl_media_policy, info->extack); if (!attrs[TIPC_NLA_MEDIA_NAME]) return -EINVAL; diff --git a/net/tipc/core.c b/net/tipc/core.c index 5b38f5164281..27cccd101ef6 100644 --- a/net/tipc/core.c +++ 
b/net/tipc/core.c @@ -43,6 +43,7 @@ #include "net.h" #include "socket.h" #include "bcast.h" +#include "node.h" #include <linux/module.h> @@ -59,6 +60,7 @@ static int __net_init tipc_init_net(struct net *net) tn->node_addr = 0; tn->trial_addr = 0; tn->addr_trial_end = 0; + tn->capabilities = TIPC_NODE_CAPABILITIES; memset(tn->node_id, 0, sizeof(tn->node_id)); memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; diff --git a/net/tipc/core.h b/net/tipc/core.h index 8020a6c360ff..7a68e1b6a066 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -122,6 +122,9 @@ struct tipc_net { /* Topology subscription server */ struct tipc_topsrv *topsrv; atomic_t subscription_count; + + /* Cluster capabilities */ + u16 capabilities; }; static inline struct tipc_net *tipc_net(struct net *net) diff --git a/net/tipc/group.c b/net/tipc/group.c index 63f39201e41e..992be6113676 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -917,7 +917,7 @@ void tipc_group_member_evt(struct tipc_group *grp, int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { - struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP); + struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; diff --git a/net/tipc/link.c b/net/tipc/link.c index 341ecd796aa4..f5cd986e1e50 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -151,6 +151,7 @@ struct tipc_link { /* Failover/synch */ u16 drop_point; struct sk_buff *failover_reasm_skb; + struct sk_buff_head failover_deferdq; /* Max packet negotiation */ u16 mtu; @@ -209,6 +210,7 @@ enum { }; #define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ +#define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) /* * Interval between NACKs when packets arrive out of order @@ -246,6 +248,10 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data); +static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq); /* * Simple non-static link routines (i.e. 
referenced outside this file) @@ -493,6 +499,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, __skb_queue_head_init(&l->transmq); __skb_queue_head_init(&l->backlogq); __skb_queue_head_init(&l->deferdq); + __skb_queue_head_init(&l->failover_deferdq); skb_queue_head_init(&l->wakeupq); skb_queue_head_init(l->inputq); return true; @@ -869,6 +876,8 @@ void tipc_link_reset(struct tipc_link *l) __skb_queue_head_init(&list); l->in_session = false; + /* Force re-synch of peer session number before establishing */ + l->peer_session--; l->session++; l->mtu = l->advertised_mtu; @@ -883,6 +892,7 @@ void tipc_link_reset(struct tipc_link *l) __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); __skb_queue_purge(&l->backlogq); + __skb_queue_purge(&l->failover_deferdq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; @@ -1154,34 +1164,14 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, * Consumes buffer */ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *inputq) + struct sk_buff_head *inputq, + struct sk_buff **reasm_skb) { struct tipc_msg *hdr = buf_msg(skb); - struct sk_buff **reasm_skb = &l->reasm_buf; struct sk_buff *iskb; struct sk_buff_head tmpq; int usr = msg_user(hdr); - int rc = 0; int pos = 0; - int ipos = 0; - - if (unlikely(usr == TUNNEL_PROTOCOL)) { - if (msg_type(hdr) == SYNCH_MSG) { - __skb_queue_purge(&l->deferdq); - goto drop; - } - if (!tipc_msg_extract(skb, &iskb, &ipos)) - return rc; - kfree_skb(skb); - skb = iskb; - hdr = buf_msg(skb); - if (less(msg_seqno(hdr), l->drop_point)) - goto drop; - if (tipc_data_input(l, skb, inputq)) - return rc; - usr = msg_user(hdr); - reasm_skb = &l->failover_reasm_skb; - } if (usr == MSG_BUNDLER) { skb_queue_head_init(&tmpq); @@ -1206,11 +1196,66 @@ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, tipc_link_bc_init_rcv(l->bc_rcvlink, hdr); tipc_bcast_unlock(l->net); } -drop: + kfree_skb(skb); return 0; } +/* tipc_link_tnl_rcv() - receive TUNNEL_PROTOCOL message, drop or process the + * inner message along with the ones in the old link's + * deferdq + * @l: tunnel link + * @skb: TUNNEL_PROTOCOL message + * @inputq: queue to put messages ready for delivery + */ +static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *inputq) +{ + struct sk_buff **reasm_skb = &l->failover_reasm_skb; + struct sk_buff_head *fdefq = &l->failover_deferdq; + struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff *iskb; + int ipos = 0; + int rc = 0; + u16 seqno; + + /* SYNCH_MSG */ + if (msg_type(hdr) == SYNCH_MSG) + goto drop; + + /* FAILOVER_MSG */ + if (!tipc_msg_extract(skb, &iskb, &ipos)) { + pr_warn_ratelimited("Cannot extract FAILOVER_MSG, defq: %d\n", + skb_queue_len(fdefq)); + return rc; + } + + do { + seqno = buf_seqno(iskb); + + if (unlikely(less(seqno, l->drop_point))) { + kfree_skb(iskb); + continue; + } + + if (unlikely(seqno != l->drop_point)) { + __tipc_skb_queue_sorted(fdefq, seqno, iskb); + continue; + } + + l->drop_point++; + + if (!tipc_data_input(l, iskb, inputq)) + rc |= tipc_link_input(l, iskb, inputq, reasm_skb); + if (unlikely(rc)) + break; + } while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point))); + +drop: + kfree_skb(skb); + return rc; +} + static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) { bool released = false; @@ -1226,6 +1271,106 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 
acked) return released; } +/* tipc_build_gap_ack_blks - build Gap ACK blocks + * @l: tipc link that data have come with gaps in sequence if any + * @data: data buffer to store the Gap ACK blocks after built + * + * returns the actual allocated memory size + */ +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data) +{ + struct sk_buff *skb = skb_peek(&l->deferdq); + struct tipc_gap_ack_blks *ga = data; + u16 len, expect, seqno = 0; + u8 n = 0; + + if (!skb) + goto exit; + + expect = buf_seqno(skb); + skb_queue_walk(&l->deferdq, skb) { + seqno = buf_seqno(skb); + if (unlikely(more(seqno, expect))) { + ga->gacks[n].ack = htons(expect - 1); + ga->gacks[n].gap = htons(seqno - expect); + if (++n >= MAX_GAP_ACK_BLKS) { + pr_info_ratelimited("Too few Gap ACK blocks!\n"); + goto exit; + } + } else if (unlikely(less(seqno, expect))) { + pr_warn("Unexpected skb in deferdq!\n"); + continue; + } + expect = seqno + 1; + } + + /* last block */ + ga->gacks[n].ack = htons(seqno); + ga->gacks[n].gap = 0; + n++; + +exit: + len = tipc_gap_ack_blks_sz(n); + ga->len = htons(len); + ga->gack_cnt = n; + return len; +} + +/* tipc_link_advance_transmq - advance TIPC link transmq queue by releasing + * acked packets, also doing retransmissions if + * gaps found + * @l: tipc link with transmq queue to be advanced + * @acked: seqno of last packet acked by peer without any gaps before + * @gap: # of gap packets + * @ga: buffer pointer to Gap ACK blocks from peer + * @xmitq: queue for accumulating the retransmitted packets if any + */ +static void tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb, *_skb, *tmp; + struct tipc_msg *hdr; + u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 ack = l->rcv_nxt - 1; + u16 seqno; + u16 n = 0; + + skb_queue_walk_safe(&l->transmq, skb, tmp) { + seqno = buf_seqno(skb); + +next_gap_ack: + if (less_eq(seqno, acked)) { + /* release skb */ + __skb_unlink(skb, &l->transmq); + kfree_skb(skb); + } else if (less_eq(seqno, acked + gap)) { + /* retransmit skb */ + if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) + continue; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; + + _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + if (!_skb) + continue; + hdr = buf_msg(_skb); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_ack); + _skb->priority = TC_PRIO_CONTROL; + __skb_queue_tail(xmitq, _skb); + l->stats.retransmitted++; + } else { + /* retry with Gap ACK blocks if any */ + if (!ga || n >= ga->gack_cnt) + break; + acked = ntohs(ga->gacks[n].ack); + gap = ntohs(ga->gacks[n].gap); + n++; + goto next_gap_ack; + } + } +} + /* tipc_link_build_state_msg: prepare link state message for transmission * * Note that sending of broadcast ack is coordinated among nodes, to reduce @@ -1280,6 +1425,7 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { u32 def_cnt = ++l->stats.deferred_recv; + u32 defq_len = skb_queue_len(&l->deferdq); int match1, match2; if (link_is_bc_rcvlink(l)) { @@ -1290,7 +1436,7 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, return 0; } - if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) + if (defq_len >= 3 && !((defq_len - 3) % 16)) tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } @@ -1304,29 +1450,29 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct sk_buff_head *defq = &l->deferdq; - struct tipc_msg *hdr; + struct 
tipc_msg *hdr = buf_msg(skb); u16 seqno, rcv_nxt, win_lim; int rc = 0; + /* Verify and update link state */ + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) + return tipc_link_proto_rcv(l, skb, xmitq); + + /* Don't send probe at next timeout expiration */ + l->silent_intv_cnt = 0; + do { hdr = buf_msg(skb); seqno = msg_seqno(hdr); rcv_nxt = l->rcv_nxt; win_lim = rcv_nxt + TIPC_MAX_LINK_WIN; - /* Verify and update link state */ - if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) - return tipc_link_proto_rcv(l, skb, xmitq); - if (unlikely(!link_is_up(l))) { if (l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; goto drop; } - /* Don't send probe at next timeout expiration */ - l->silent_intv_cnt = 0; - /* Drop if outside receive window */ if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { l->stats.duplicates++; @@ -1351,13 +1497,16 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Deliver packet */ l->rcv_nxt++; l->stats.recv_pkts++; - if (!tipc_data_input(l, skb, l->inputq)) - rc |= tipc_link_input(l, skb, l->inputq); + + if (unlikely(msg_user(hdr) == TUNNEL_PROTOCOL)) + rc |= tipc_link_tnl_rcv(l, skb, l->inputq); + else if (!tipc_data_input(l, skb, l->inputq)) + rc |= tipc_link_input(l, skb, l->inputq, &l->reasm_buf); if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) rc |= tipc_link_build_state_msg(l, xmitq); if (unlikely(rc & ~TIPC_LINK_SND_STATE)) break; - } while ((skb = __skb_dequeue(defq))); + } while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt))); return rc; drop: @@ -1378,6 +1527,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, struct tipc_mon_state *mstate = &l->mon_state; int dlen = 0; void *data; + u16 glen = 0; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1390,8 +1540,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, - tipc_max_domain_size, l->addr, - tipc_own_addr(l->net), 0, 0, 0); + tipc_max_domain_size + MAX_GAP_ACK_BLKS_SZ, + l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return; @@ -1418,9 +1568,11 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); - tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); - msg_set_size(hdr, INT_H_SIZE + dlen); - skb_trim(skb, INT_H_SIZE + dlen); + if (l->peer_caps & TIPC_GAP_ACK_BLOCK) + glen = tipc_build_gap_ack_blks(l, data); + tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); + msg_set_size(hdr, INT_H_SIZE + glen + dlen); + skb_trim(skb, INT_H_SIZE + glen + dlen); l->stats.sent_states++; l->rcv_unacked = 0; } else { @@ -1479,6 +1631,7 @@ void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq) { + struct sk_buff_head *fdefq = &tnl->failover_deferdq; struct sk_buff *skb, *tnlskb; struct tipc_msg *hdr, tnlhdr; struct sk_buff_head *queue = &l->transmq; @@ -1506,7 +1659,11 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, /* Initialize reusable tunnel packet header */ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); - pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq); + if (mtyp == SYNCH_MSG) + pktcnt = l->snd_nxt - 
buf_seqno(skb_peek(&l->transmq)); + else + pktcnt = skb_queue_len(&l->transmq); + pktcnt += skb_queue_len(&l->backlogq); msg_set_msgcnt(&tnlhdr, pktcnt); msg_set_bearer_id(&tnlhdr, l->peer_bearer_id); tnl: @@ -1537,6 +1694,49 @@ tnl: tnl->drop_point = l->rcv_nxt; tnl->failover_reasm_skb = l->reasm_buf; l->reasm_buf = NULL; + + /* Failover the link's deferdq */ + if (unlikely(!skb_queue_empty(fdefq))) { + pr_warn("Link failover deferdq not empty: %d!\n", + skb_queue_len(fdefq)); + __skb_queue_purge(fdefq); + } + skb_queue_splice_init(&l->deferdq, fdefq); + } +} + +/** + * tipc_link_failover_prepare() - prepare tnl for link failover + * + * This is a special version of the precursor - tipc_link_tnl_prepare(), + * see tipc_node_link_failover() for details + * + * @l: failover link + * @tnl: tunnel link + * @xmitq: queue for messages to be xmited + */ +void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, + struct sk_buff_head *xmitq) +{ + struct sk_buff_head *fdefq = &tnl->failover_deferdq; + + tipc_link_create_dummy_tnl_msg(tnl, xmitq); + + /* This failover link endpoint was never established before, + * so it has not received anything from peer. + * Otherwise, it must be a normal failover situation or the + * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes + * would have to start over from scratch instead. + */ + WARN_ON(l && tipc_link_is_up(l)); + tnl->drop_point = 1; + tnl->failover_reasm_skb = NULL; + + /* Initialize the link's failover deferdq */ + if (unlikely(!skb_queue_empty(fdefq))) { + pr_warn("Link failover deferdq not empty: %d!\n", + skb_queue_len(fdefq)); + __skb_queue_purge(fdefq); } } @@ -1590,6 +1790,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); + struct tipc_gap_ack_blks *ga = NULL; u16 rcvgap = 0; u16 ack = msg_ack(hdr); u16 gap = msg_seq_gap(hdr); @@ -1600,6 +1801,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); bool reply = msg_probe(hdr); + u16 glen = 0; void *data; char *if_name; int rc = 0; @@ -1697,7 +1899,17 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; break; } - tipc_mon_rcv(l->net, data, dlen, l->addr, + + /* Receive Gap ACK blocks from peer if any */ + if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { + ga = (struct tipc_gap_ack_blks *)data; + glen = ntohs(ga->len); + /* sanity check: if failed, ignore Gap ACK blocks */ + if (glen != tipc_gap_ack_blks_sz(ga->gack_cnt)) + ga = NULL; + } + + tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ @@ -1706,13 +1918,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - tipc_link_release_pkts(l, ack); + + tipc_link_advance_transmq(l, ack, gap, ga, xmitq); /* If NACK, retransmit will now start at right position */ - if (gap) { - rc = tipc_link_retrans(l, l, ack + 1, ack + gap, xmitq); + if (gap) l->stats.recv_nacks++; - } tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) @@ -1972,8 +2183,8 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) { int err; - err = nla_parse_nested(props, TIPC_NLA_PROP_MAX, prop, - tipc_nl_prop_policy, NULL); + err = nla_parse_nested_deprecated(props, TIPC_NLA_PROP_MAX,
prop, + tipc_nl_prop_policy, NULL); if (err) return err; @@ -2052,7 +2263,7 @@ static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) (s->accu_queue_sz / s->queue_sz_counts) : 0} }; - stats = nla_nest_start(skb, TIPC_NLA_LINK_STATS); + stats = nla_nest_start_noflag(skb, TIPC_NLA_LINK_STATS); if (!stats) return -EMSGSIZE; @@ -2084,7 +2295,7 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK); if (!attrs) goto msg_full; @@ -2106,7 +2317,7 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE)) goto attr_msg_full; - prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); + prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) @@ -2173,7 +2384,7 @@ static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, (stats->accu_queue_sz / stats->queue_sz_counts) : 0} }; - nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS); + nest = nla_nest_start_noflag(skb, TIPC_NLA_LINK_STATS); if (!nest) return -EMSGSIZE; @@ -2197,6 +2408,8 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) struct nlattr *attrs; struct nlattr *prop; struct tipc_net *tn = net_generic(net, tipc_net_id); + u32 bc_mode = tipc_bcast_get_broadcast_mode(net); + u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); struct tipc_link *bcl = tn->bcl; if (!bcl) @@ -2211,7 +2424,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) return -EMSGSIZE; } - attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK); if (!attrs) goto msg_full; @@ -2228,11 +2441,17 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0)) goto attr_msg_full; - prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); + prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) + goto prop_msg_full; + if (bc_mode & BCLINK_MODE_SEL) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST_RATIO, + bc_ratio)) + goto prop_msg_full; nla_nest_end(msg->skb, prop); err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats); diff --git a/net/tipc/link.h b/net/tipc/link.h index 8439e0ee53a8..adcad65e761c 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -90,6 +90,8 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl, struct sk_buff_head *xmitq); +void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, + struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); bool tipc_link_is_up(struct tipc_link *l); diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 67f69389ec17..6a6eae88442f 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -696,7 +696,7 @@ static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_MON_PEER); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON_PEER); if (!attrs) goto msg_full; @@ -785,7 +785,7 @@ int 
__tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_MON); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index d7e4b8b93f9d..8de02ad6e352 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -117,6 +117,37 @@ struct tipc_msg { __be32 hdr[15]; }; +/* struct tipc_gap_ack - TIPC Gap ACK block + * @ack: seqno of the last consecutive packet in link deferdq + * @gap: number of gap packets since the last ack + * + * E.g.: + * link deferdq: 1 2 3 4 10 11 13 14 15 20 + * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> + */ +struct tipc_gap_ack { + __be16 ack; + __be16 gap; +}; + +/* struct tipc_gap_ack_blks + * @len: actual length of the record + * @gack_cnt: number of Gap ACK blocks in the record + * @gacks: array of Gap ACK blocks + */ +struct tipc_gap_ack_blks { + __be16 len; + u8 gack_cnt; + u8 reserved; + struct tipc_gap_ack gacks[]; +}; + +#define tipc_gap_ack_blks_sz(n) (sizeof(struct tipc_gap_ack_blks) + \ + sizeof(struct tipc_gap_ack) * (n)) + +#define MAX_GAP_ACK_BLKS 32 +#define MAX_GAP_ACK_BLKS_SZ tipc_gap_ack_blks_sz(MAX_GAP_ACK_BLKS) + static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; @@ -257,6 +288,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) msg_set_bits(m, 0, 18, 1, d); } +static inline bool msg_is_rcast(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 0x1); +} + +static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) +{ + msg_set_bits(m, 0, 18, 0x1, d); +} + static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); @@ -1110,4 +1151,25 @@ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, tipc_skb_queue_splice_tail(&tmp, head); } +/* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno + * @list: list to be dequeued from + * @seqno: seqno of the expected msg + * + * returns the skb dequeued from the list if its seqno is less than or equal + * to the expected one; otherwise the skb is still held + * + * Note: must be used with appropriate locks held only + */ +static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, + u16 seqno) +{ + struct sk_buff *skb = skb_peek(list); + + if (skb && less_eq(buf_seqno(skb), seqno)) { + __skb_unlink(skb, list); + return skb; + } + return NULL; +} + #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bff241f03525..66a65c2cdb23 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -829,11 +829,11 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; - b = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); + b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; @@ -909,7 +909,8 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; - if (*last_type) { + if (*last_type || + (!i && *last_key && (*last_lower == *last_key))) { service = tipc_service_find(net, *last_type); if (!service) return -EPIPE; diff --git a/net/tipc/net.c b/net/tipc/net.c index 7ce1e86b024f..85707c185360 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -187,7 
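
The worked example in the new msg.h comment (deferdq 1 2 3 4 10 11 13 14 15 20 yielding blocks <4, 5>, <11, 1>, <15, 4>, <20, 0>) can be reproduced with a short userspace sketch. This shows only the encoding rule the comment describes, not the kernel's actual block builder:

#include <stdio.h>
#include <stdint.h>

/* Encode a sorted seqno list the way the msg.h comment describes:
 * ack = last seqno of a consecutive run, gap = seqnos missing before
 * the next run (0 for the final block).
 */
static void build_gap_ack_blocks(const uint16_t *seq, int n)
{
	for (int i = 0; i < n; i++) {
		if (i + 1 < n && seq[i + 1] == seq[i] + 1)
			continue;                 /* still inside a run */
		uint16_t gap = (i + 1 < n) ? seq[i + 1] - seq[i] - 1 : 0;
		printf("<%u, %u> ", seq[i], gap);
	}
	printf("\n");
}

int main(void)
{
	uint16_t deferdq[] = { 1, 2, 3, 4, 10, 11, 13, 14, 15, 20 };

	build_gap_ack_blocks(deferdq, 10); /* <4, 5> <11, 1> <15, 4> <20, 0> */
	return 0;
}
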
+187,7 @@ static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NET); if (!attrs) goto msg_full; @@ -245,9 +245,9 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, - info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, - info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, + info->attrs[TIPC_NLA_NET], + tipc_nl_net_policy, info->extack); if (err) return err; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 99ee419210ba..99bd166bccec 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -110,7 +110,9 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } + [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_BROADCAST] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_BROADCAST_RATIO] = { .type = NLA_U32 } }; const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { @@ -141,115 +143,115 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { static const struct genl_ops tipc_genl_v2_ops[] = { { .cmd = TIPC_NL_BEARER_DISABLE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_bearer_disable, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_BEARER_ENABLE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_bearer_enable, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_BEARER_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_bearer_get, .dumpit = tipc_nl_bearer_dump, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_BEARER_ADD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_bearer_add, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_BEARER_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_bearer_set, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_SOCK_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = tipc_dump_start, .dumpit = tipc_nl_sk_dump, .done = tipc_dump_done, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_PUBL_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = tipc_nl_publ_dump, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_LINK_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_node_get_link, .dumpit = tipc_nl_node_dump_link, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_LINK_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_node_set_link, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_LINK_RESET_STATS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_node_reset_link_stats, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_MEDIA_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_media_get, .dumpit = tipc_nl_media_dump, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_MEDIA_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_media_set, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_NODE_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 
.dumpit = tipc_nl_node_dump, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_NET_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = tipc_nl_net_dump, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_NET_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_net_set, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_NAME_TABLE_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = tipc_nl_name_table_dump, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_MON_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_node_set_monitor, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_MON_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_node_get_monitor, .dumpit = tipc_nl_node_dump_monitor, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_MON_PEER_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = tipc_nl_node_dump_monitor_peer, - .policy = tipc_nl_policy, }, { .cmd = TIPC_NL_PEER_REMOVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_peer_rm, - .policy = tipc_nl_policy, }, #ifdef CONFIG_TIPC_MEDIA_UDP { .cmd = TIPC_NL_UDP_GET_REMOTEIP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = tipc_udp_nl_dump_remoteip, - .policy = tipc_nl_policy, }, #endif }; @@ -259,6 +261,7 @@ struct genl_family tipc_genl_family __ro_after_init = { .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, .maxattr = TIPC_NLA_MAX, + .policy = tipc_nl_policy, .netnsok = true, .module = THIS_MODULE, .ops = tipc_genl_v2_ops, @@ -273,8 +276,8 @@ int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) if (!*attr) return -EOPNOTSUPP; - return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy, - NULL); + return nlmsg_parse_deprecated(nlh, GENL_HDRLEN, *attr, maxattr, + tipc_nl_policy, NULL); } int __init tipc_netlink_start(void) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 4ad3586da8f0..c6a04c09d075 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -267,8 +267,14 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, if (msg->rep_type) tipc_tlv_init(msg->rep, msg->rep_type); - if (cmd->header) - (*cmd->header)(msg); + if (cmd->header) { + err = (*cmd->header)(msg); + if (err) { + kfree_skb(msg->rep); + msg->rep = NULL; + return err; + } + } arg = nlmsg_new(0, GFP_KERNEL); if (!arg) { @@ -322,9 +328,9 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (err) goto doit_out; - err = nla_parse(attrbuf, tipc_genl_family.maxattr, - (const struct nlattr *)trans_buf->data, - trans_buf->len, NULL, NULL); + err = nla_parse_deprecated(attrbuf, tipc_genl_family.maxattr, + (const struct nlattr *)trans_buf->data, + trans_buf->len, NULL, NULL); if (err) goto doit_out; @@ -372,8 +378,8 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, - attrs[TIPC_NLA_BEARER], NULL, NULL); + err = nla_parse_nested_deprecated(bearer, TIPC_NLA_BEARER_MAX, + attrs[TIPC_NLA_BEARER], NULL, NULL); if (err) return err; @@ -393,11 +399,16 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, b = (struct tipc_bearer_config *)TLV_DATA(msg->req); - bearer = nla_nest_start(skb, TIPC_NLA_BEARER); + bearer = nla_nest_start_noflag(skb, TIPC_NLA_BEARER); if (!bearer) return 
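
The edits to tipc_genl_v2_ops above all follow one mechanical pattern: per-op .policy pointers are removed, a single .policy is attached to the genl_family, and pre-existing commands are marked GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP so the new strict validation does not break legacy userspace. A condensed, kernel-context sketch of the resulting shape, using hypothetical FOO_* names rather than TIPC's:

#include <net/genetlink.h>   /* kernel build context assumed */

enum { FOO_ATTR_UNSPEC, FOO_ATTR_ID, __FOO_ATTR_MAX };
#define FOO_ATTR_MAX (__FOO_ATTR_MAX - 1)
enum { FOO_CMD_UNSPEC, FOO_CMD_GET };

static int foo_get(struct sk_buff *skb, struct genl_info *info)
{
	return 0;   /* placeholder doit handler */
}

static const struct nla_policy foo_policy[FOO_ATTR_MAX + 1] = {
	[FOO_ATTR_ID] = { .type = NLA_U32 },
};

static const struct genl_ops foo_ops[] = {
	{
		.cmd = FOO_CMD_GET,
		/* legacy command: opt out of the new strict checks */
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = foo_get,
	},
};

static struct genl_family foo_family __ro_after_init = {
	.name = "foo",
	.version = 1,
	.maxattr = FOO_ATTR_MAX,
	.policy = foo_policy,   /* one policy, applied to every op */
	.module = THIS_MODULE,
	.ops = foo_ops,
	.n_ops = ARRAY_SIZE(foo_ops),
};
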
-EMSGSIZE; - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_BEARER_NAME); + len = TLV_GET_DATA_LEN(msg->req); + len -= offsetof(struct tipc_bearer_config, name); + if (len <= 0) + return -EINVAL; + + len = min_t(int, len, TIPC_MAX_BEARER_NAME); if (!string_is_valid(b->name, len)) return -EINVAL; @@ -408,7 +419,7 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd, return -EMSGSIZE; if (ntohl(b->priority) <= TIPC_MAX_LINK_PRI) { - prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP); + prop = nla_nest_start_noflag(skb, TIPC_NLA_BEARER_PROP); if (!prop) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(b->priority))) @@ -430,7 +441,7 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd, name = (char *)TLV_DATA(msg->req); - bearer = nla_nest_start(skb, TIPC_NLA_BEARER); + bearer = nla_nest_start_noflag(skb, TIPC_NLA_BEARER); if (!bearer) return -EMSGSIZE; @@ -503,24 +514,26 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_LINK]) return -EINVAL; - err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], - NULL, NULL); + err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, + attrs[TIPC_NLA_LINK], NULL, NULL); if (err) return err; if (!link[TIPC_NLA_LINK_PROP]) return -EINVAL; - err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX, - link[TIPC_NLA_LINK_PROP], NULL, NULL); + err = nla_parse_nested_deprecated(prop, TIPC_NLA_PROP_MAX, + link[TIPC_NLA_LINK_PROP], NULL, + NULL); if (err) return err; if (!link[TIPC_NLA_LINK_STATS]) return -EINVAL; - err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX, - link[TIPC_NLA_LINK_STATS], NULL, NULL); + err = nla_parse_nested_deprecated(stats, TIPC_NLA_STATS_MAX, + link[TIPC_NLA_LINK_STATS], NULL, + NULL); if (err) return err; @@ -634,8 +647,8 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_LINK]) return -EINVAL; - err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], - NULL, NULL); + err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, + attrs[TIPC_NLA_LINK], NULL, NULL); if (err) return err; @@ -674,7 +687,7 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, lc = (struct tipc_link_config *)TLV_DATA(msg->req); - media = nla_nest_start(skb, TIPC_NLA_MEDIA); + media = nla_nest_start_noflag(skb, TIPC_NLA_MEDIA); if (!media) return -EMSGSIZE; @@ -685,7 +698,7 @@ static int tipc_nl_compat_media_set(struct sk_buff *skb, if (nla_put_string(skb, TIPC_NLA_MEDIA_NAME, lc->name)) return -EMSGSIZE; - prop = nla_nest_start(skb, TIPC_NLA_MEDIA_PROP); + prop = nla_nest_start_noflag(skb, TIPC_NLA_MEDIA_PROP); if (!prop) return -EMSGSIZE; @@ -706,7 +719,7 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, lc = (struct tipc_link_config *)TLV_DATA(msg->req); - bearer = nla_nest_start(skb, TIPC_NLA_BEARER); + bearer = nla_nest_start_noflag(skb, TIPC_NLA_BEARER); if (!bearer) return -EMSGSIZE; @@ -717,7 +730,7 @@ static int tipc_nl_compat_bearer_set(struct sk_buff *skb, if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, lc->name)) return -EMSGSIZE; - prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP); + prop = nla_nest_start_noflag(skb, TIPC_NLA_BEARER_PROP); if (!prop) return -EMSGSIZE; @@ -737,14 +750,14 @@ static int __tipc_nl_compat_link_set(struct sk_buff *skb, lc = (struct tipc_link_config *)TLV_DATA(msg->req); - link = nla_nest_start(skb, TIPC_NLA_LINK); + link = nla_nest_start_noflag(skb, TIPC_NLA_LINK); if (!link) return -EMSGSIZE; if (nla_put_string(skb, 
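
The tipc_nl_compat_bearer_enable() fix just above guards against a TLV that is shorter than the fixed part of struct tipc_bearer_config, which previously let string_is_valid() scan past the data userspace actually supplied; the same pattern recurs for tipc_nl_compat_link_set() below. A standalone userspace model of the corrected bounds check (the struct and sizes are stand-ins, not the real TIPC layout):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct bearer_config_like {            /* stand-in for tipc_bearer_config */
	unsigned int priority;
	unsigned int disc_domain;
	char name[32];
};

/* Returns 1 if 'name' is NUL-terminated within the bytes the TLV really
 * carries; tlv_data_len plays the role of TLV_GET_DATA_LEN(msg->req).
 */
static int tlv_name_ok(const struct bearer_config_like *b, int tlv_data_len)
{
	int len = tlv_data_len - (int)offsetof(struct bearer_config_like, name);

	if (len <= 0)                      /* TLV too short to hold any name */
		return 0;
	if (len > (int)sizeof(b->name))
		len = sizeof(b->name);
	return memchr(b->name, '\0', len) != NULL; /* string_is_valid() */
}

int main(void)
{
	struct bearer_config_like b = { .name = "eth:ens3" };

	printf("%d\n", tlv_name_ok(&b, (int)sizeof(b))); /* 1: full TLV */
	printf("%d\n", tlv_name_ok(&b, 4));              /* 0: header only */
	return 0;
}
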
TIPC_NLA_LINK_NAME, lc->name)) return -EMSGSIZE; - prop = nla_nest_start(skb, TIPC_NLA_LINK_PROP); + prop = nla_nest_start_noflag(skb, TIPC_NLA_LINK_PROP); if (!prop) return -EMSGSIZE; @@ -766,7 +779,12 @@ static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, lc = (struct tipc_link_config *)TLV_DATA(msg->req); - len = min_t(int, TLV_GET_DATA_LEN(msg->req), TIPC_MAX_LINK_NAME); + len = TLV_GET_DATA_LEN(msg->req); + len -= offsetof(struct tipc_link_config, name); + if (len <= 0) + return -EINVAL; + + len = min_t(int, len, TIPC_MAX_LINK_NAME); if (!string_is_valid(lc->name, len)) return -EINVAL; @@ -795,7 +813,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd, name = (char *)TLV_DATA(msg->req); - link = nla_nest_start(skb, TIPC_NLA_LINK); + link = nla_nest_start_noflag(skb, TIPC_NLA_LINK); if (!link) return -EMSGSIZE; @@ -853,16 +871,18 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_NAME_TABLE]) return -EINVAL; - err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX, - attrs[TIPC_NLA_NAME_TABLE], NULL, NULL); + err = nla_parse_nested_deprecated(nt, TIPC_NLA_NAME_TABLE_MAX, + attrs[TIPC_NLA_NAME_TABLE], NULL, + NULL); if (err) return err; if (!nt[TIPC_NLA_NAME_TABLE_PUBL]) return -EINVAL; - err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, - nt[TIPC_NLA_NAME_TABLE_PUBL], NULL, NULL); + err = nla_parse_nested_deprecated(publ, TIPC_NLA_PUBL_MAX, + nt[TIPC_NLA_NAME_TABLE_PUBL], NULL, + NULL); if (err) return err; @@ -921,8 +941,8 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_PUBL]) return -EINVAL; - err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], - NULL, NULL); + err = nla_parse_nested_deprecated(publ, TIPC_NLA_PUBL_MAX, + attrs[TIPC_NLA_PUBL], NULL, NULL); if (err) return err; @@ -957,7 +977,7 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) return -EMSGSIZE; } - nest = nla_nest_start(args, TIPC_NLA_SOCK); + nest = nla_nest_start_noflag(args, TIPC_NLA_SOCK); if (!nest) { kfree_skb(args); return -EMSGSIZE; @@ -991,8 +1011,8 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; - err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], - NULL, NULL); + err = nla_parse_nested_deprecated(sock, TIPC_NLA_SOCK_MAX, + attrs[TIPC_NLA_SOCK], NULL, NULL); if (err) return err; @@ -1003,8 +1023,9 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, u32 node; struct nlattr *con[TIPC_NLA_CON_MAX + 1]; - err = nla_parse_nested(con, TIPC_NLA_CON_MAX, - sock[TIPC_NLA_SOCK_CON], NULL, NULL); + err = nla_parse_nested_deprecated(con, TIPC_NLA_CON_MAX, + sock[TIPC_NLA_SOCK_CON], + NULL, NULL); if (err) return err; @@ -1043,8 +1064,8 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_MEDIA]) return -EINVAL; - err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, - attrs[TIPC_NLA_MEDIA], NULL, NULL); + err = nla_parse_nested_deprecated(media, TIPC_NLA_MEDIA_MAX, + attrs[TIPC_NLA_MEDIA], NULL, NULL); if (err) return err; @@ -1063,8 +1084,8 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_NODE]) return -EINVAL; - err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], - NULL, NULL); + err = nla_parse_nested_deprecated(node, TIPC_NLA_NODE_MAX, + attrs[TIPC_NLA_NODE], NULL, NULL); if (err) return err; @@ -1084,7 +1105,7 @@ static int 
tipc_nl_compat_net_set(struct tipc_nl_compat_cmd_doit *cmd, val = ntohl(*(__be32 *)TLV_DATA(msg->req)); - net = nla_nest_start(skb, TIPC_NLA_NET); + net = nla_nest_start_noflag(skb, TIPC_NLA_NET); if (!net) return -EMSGSIZE; @@ -1110,8 +1131,8 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_NET]) return -EINVAL; - err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], - NULL, NULL); + err = nla_parse_nested_deprecated(net, TIPC_NLA_NET_MAX, + attrs[TIPC_NLA_NET], NULL, NULL); if (err) return err; @@ -1284,6 +1305,7 @@ send: static const struct genl_ops tipc_genl_compat_ops[] = { { .cmd = TIPC_GENL_CMD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tipc_nl_compat_recv, }, }; diff --git a/net/tipc/node.c b/net/tipc/node.c index dd3b6dc17662..9e106d3ed187 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -375,14 +375,20 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ - write_lock_bh(&n->lock); + tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } - write_unlock_bh(&n->lock); + tipc_node_write_unlock_fast(n); + + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -433,6 +439,11 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, break; } list_add_tail_rcu(&n->list, &temp_node->list); + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); @@ -589,6 +600,7 @@ static void tipc_node_clear_links(struct tipc_node *node) */ static bool tipc_node_cleanup(struct tipc_node *peer) { + struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; @@ -604,6 +616,13 @@ static bool tipc_node_cleanup(struct tipc_node *peer) deleted = true; } tipc_node_write_unlock(peer); + + /* Calculate cluster capabilities */ + tn->capabilities = TIPC_NODE_CAPABILITIES; + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + tn->capabilities &= temp_node->capabilities; + } + spin_unlock_bh(&tn->node_list_lock); return deleted; } @@ -695,7 +714,6 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); - n->failover_sent = false; n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); @@ -738,6 +756,45 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, } /** + * tipc_node_link_failover() - start failover in case of "half-failover" + * + * This function is only called in a very special situation where link + * failover may already have been started on the peer node but not on + * this node. This can happen when e.g. + * 1. Both links <1A-2A>, <1B-2B> down + * 2. Link endpoint 2A up, but 1A still down (e.g. due to network + * disturbance, wrong session, etc.) + * 3. Link <1B-2B> up + * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) + * 5. Node B starts failover onto link <1B-2B> + * + * ==> Node A never starts link/node failover! + * + * @n: tipc node structure + * @l: failing-over link peer endpoint (can be NULL) + * @tnl: tunnel link + * @xmitq: queue for messages to be xmited on tnl link later + */ +static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, + struct tipc_link *tnl, + struct sk_buff_head *xmitq) +{ + /* Avoid a "self-failover" that can never end */ + if (!tipc_link_is_up(tnl)) + return; + + tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); + tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); + + n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); + tipc_link_failover_prepare(l, tnl, xmitq); + + if (l) + tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); + tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); +} + +/** * __tipc_node_link_down - handle loss of link */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, @@ -1340,7 +1397,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_NODE); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; @@ -1656,14 +1713,16 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } + /* If parallel link was already down, and this happened before - * the tunnel link came up, FAILOVER was never sent. Ensure that - * FAILOVER is sent to get peer out of NODE_FAILINGOVER state. + * the tunnel link came up, node failover was never started. + * Ensure that a FAILOVER_MSG is sent to get the peer out of + * NODE_FAILINGOVER state; this node must also accept + * TUNNEL_MSGs from the peer. 
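
The sync point computed in tipc_node_link_failover() relies on 16-bit serial-number arithmetic: rcv_nxt plus (U16_MAX / 2 - 1) wraps modulo 2^16, and helpers like less()/less_eq() compare seqnos by signed distance, so the comparison stays correct across the wrap. A tiny userspace demonstration (less() here mirrors the kernel helper of the same name; values are made up):

#include <stdint.h>
#include <stdio.h>

/* TIPC-style serial-number comparison: a is "less than" b if the
 * signed 16-bit distance from b to a is negative.
 */
static int less(uint16_t a, uint16_t b)
{
	return (int16_t)(a - b) < 0;
}

int main(void)
{
	uint16_t rcv_nxt = 65000;
	uint16_t sync_point = rcv_nxt + (UINT16_MAX / 2 - 1); /* wraps mod 2^16 */

	printf("sync_point = %u\n", (unsigned)sync_point);    /* 32230 */
	/* Despite the numeric wrap, rcv_nxt still sorts before sync_point: */
	printf("less(rcv_nxt, sync_point) = %d\n", less(rcv_nxt, sync_point));
	return 0;
}
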
*/ - if (n->state != NODE_FAILINGOVER && !n->failover_sent) { - tipc_link_create_dummy_tnl_msg(l, xmitq); - n->failover_sent = true; - } + if (n->state != NODE_FAILINGOVER) + tipc_node_link_failover(n, pl, l, xmitq); + /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; @@ -1866,9 +1925,9 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, - info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, - info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, + info->attrs[TIPC_NLA_NET], + tipc_nl_net_policy, info->extack); if (err) return err; @@ -2024,9 +2083,9 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, - info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy, info->extack); if (err) return err; @@ -2100,9 +2159,9 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, - info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy, info->extack); if (err) return err; @@ -2165,9 +2224,9 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, - info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy, info->extack); if (err) return err; @@ -2305,9 +2364,10 @@ int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; - err = nla_parse_nested(attrs, TIPC_NLA_MON_MAX, - info->attrs[TIPC_NLA_MON], - tipc_nl_monitor_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, + info->attrs[TIPC_NLA_MON], + tipc_nl_monitor_policy, + info->extack); if (err) return err; @@ -2334,7 +2394,7 @@ static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) if (!hdr) return -EMSGSIZE; - attrs = nla_nest_start(msg->skb, TIPC_NLA_MON); + attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; @@ -2425,9 +2485,10 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, if (!attrs[TIPC_NLA_MON]) return -EINVAL; - err = nla_parse_nested(mon, TIPC_NLA_MON_MAX, - attrs[TIPC_NLA_MON], - tipc_nl_monitor_policy, NULL); + err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, + attrs[TIPC_NLA_MON], + tipc_nl_monitor_policy, + NULL); if (err) return err; diff --git a/net/tipc/node.h b/net/tipc/node.h index 4f59a30e989a..c0bf49ea3de4 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -51,7 +51,9 @@ enum { TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), TIPC_NODE_ID128 = (1 << 5), - TIPC_LINK_PROTO_SEQNO = (1 << 6) + TIPC_LINK_PROTO_SEQNO = (1 << 6), + TIPC_MCAST_RBCTL = (1 << 7), + TIPC_GAP_ACK_BLOCK = (1 << 8) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -60,7 +62,9 @@ enum { TIPC_BCAST_RCAST | \ TIPC_BLOCK_FLOWCTL | \ TIPC_NODE_ID128 | \ - 
TIPC_LINK_PROTO_SEQNO) + TIPC_LINK_PROTO_SEQNO | \ + TIPC_MCAST_RBCTL | \ + TIPC_GAP_ACK_BLOCK) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b542f14ed444..dd8537f988c4 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -485,6 +485,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) tsk_set_unreliable(tsk, true); + __skb_queue_head_init(&tsk->mc_method.deferredq); } trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " "); @@ -582,6 +583,7 @@ static int tipc_release(struct socket *sock) sk->sk_shutdown = SHUTDOWN_MASK; tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); + __skb_queue_purge(&tsk->mc_method.deferredq); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); @@ -734,11 +736,11 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: - case TIPC_CONNECTING: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) revents |= EPOLLOUT; /* fall through */ case TIPC_LISTEN: + case TIPC_CONNECTING: if (!skb_queue_empty(&sk->sk_receive_queue)) revents |= EPOLLIN | EPOLLRDNORM; break; @@ -2041,7 +2043,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (msg_data_sz(hdr)) return true; /* Empty ACK-, - wake up sleeping connect() and drop */ - sk->sk_data_ready(sk); + sk->sk_state_change(sk); msg_set_dest_droppable(hdr, 1); return false; } @@ -2149,6 +2151,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); struct net *net = sock_net(sk); struct sk_buff_head inputq; + int mtyp = msg_type(hdr); int limit, err = TIPC_OK; trace_tipc_sk_filter_rcv(sk, skb, TIPC_DUMP_ALL, " "); @@ -2162,6 +2165,9 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, if (unlikely(grp)) tipc_group_filter_msg(grp, &inputq, xmitq); + if (unlikely(!grp) && mtyp == TIPC_MCAST_MSG) + tipc_mcast_filter_msg(net, &tsk->mc_method.deferredq, &inputq); + /* Validate and add to receive buffer if there is space */ while ((skb = __skb_dequeue(&inputq))) { hdr = buf_msg(skb); @@ -3064,6 +3070,9 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_SOCK_RECVQ_USED: + value = sk_rmem_alloc_get(sk); + break; case TIPC_GROUP_JOIN: seq.type = 0; if (tsk->group) @@ -3264,7 +3273,7 @@ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) peer_node = tsk_peer_node(tsk); peer_port = tsk_peer_port(tsk); - nest = nla_nest_start(skb, TIPC_NLA_SOCK_CON); + nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_CON); if (!nest) return -EMSGSIZE; @@ -3323,7 +3332,7 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, if (!hdr) goto msg_cancel; - attrs = nla_nest_start(skb, TIPC_NLA_SOCK); + attrs = nla_nest_start_noflag(skb, TIPC_NLA_SOCK); if (!attrs) goto genlmsg_cancel; @@ -3428,7 +3437,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, if (!(sk_filter_state & (1 << sk->sk_state))) return 0; - attrs = nla_nest_start(skb, TIPC_NLA_SOCK); + attrs = nla_nest_start_noflag(skb, TIPC_NLA_SOCK); if (!attrs) goto msg_cancel; @@ -3446,7 +3455,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, TIPC_NLA_SOCK_PAD)) goto attr_msg_cancel; - stat = nla_nest_start(skb, TIPC_NLA_SOCK_STAT); + stat = nla_nest_start_noflag(skb, 
TIPC_NLA_SOCK_STAT); if (!stat) goto attr_msg_cancel; @@ -3503,7 +3512,7 @@ static int __tipc_nl_add_sk_publ(struct sk_buff *skb, if (!hdr) goto msg_cancel; - attrs = nla_nest_start(skb, TIPC_NLA_PUBL); + attrs = nla_nest_start_noflag(skb, TIPC_NLA_PUBL); if (!attrs) goto genlmsg_cancel; @@ -3590,9 +3599,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; - err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, - attrs[TIPC_NLA_SOCK], - tipc_nl_sock_policy, NULL); + err = nla_parse_nested_deprecated(sock, TIPC_NLA_SOCK_MAX, + attrs[TIPC_NLA_SOCK], + tipc_nl_sock_policy, NULL); if (err) return err; diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 3481e4906bd6..9df82a573aa7 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -38,6 +38,8 @@ #include <linux/sysctl.h> +static int zero; +static int one = 1; static struct ctl_table_header *tipc_ctl_hdr; static struct ctl_table tipc_table[] = { @@ -46,14 +48,16 @@ static struct ctl_table tipc_table[] = { .data = &sysctl_tipc_rmem, .maxlen = sizeof(sysctl_tipc_rmem), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "named_timeout", .data = &sysctl_tipc_named_timeout, .maxlen = sizeof(sysctl_tipc_named_timeout), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "sk_filter", diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 4d85d71f16e2..7fc02d84c4f1 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -44,7 +44,7 @@ #include <net/sock.h> #include <net/ip.h> #include <net/udp_tunnel.h> -#include <net/addrconf.h> +#include <net/ipv6_stubs.h> #include <linux/tipc_netlink.h> #include "core.h" #include "addr.h" @@ -354,25 +354,21 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) skb_pull(skb, sizeof(struct udphdr)); hdr = buf_msg(skb); - rcu_read_lock(); - b = rcu_dereference_rtnl(ub->bearer); + b = rcu_dereference(ub->bearer); if (!b) - goto rcu_out; + goto out; if (b && test_bit(0, &b->up)) { tipc_rcv(sock_net(sk), skb, b); - rcu_read_unlock(); return 0; } if (unlikely(msg_user(hdr) == LINK_CONFIG)) { err = tipc_udp_rcast_disc(b, skb); if (err) - goto rcu_out; + goto out; } -rcu_out: - rcu_read_unlock(); out: kfree_skb(skb); return 0; @@ -451,9 +447,9 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) if (!attrs[TIPC_NLA_BEARER]) return -EINVAL; - err = nla_parse_nested(battrs, TIPC_NLA_BEARER_MAX, - attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy, NULL); + err = nla_parse_nested_deprecated(battrs, TIPC_NLA_BEARER_MAX, + attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy, NULL); if (err) return err; @@ -527,7 +523,7 @@ int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) if (!ub) return -ENODEV; - nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); + nest = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); if (!nest) goto msg_full; @@ -605,8 +601,7 @@ int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; struct udp_media_addr *dst; - if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, - tipc_nl_udp_policy, NULL)) + if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy, NULL)) return -EINVAL; if (!opts[TIPC_NLA_UDP_REMOTE]) @@ -659,9 +654,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if 
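
The sysctl change above swaps proc_dointvec for proc_dointvec_minmax and supplies .extra1 as a lower bound, so a write below the minimum is rejected by the kernel handler (with -EINVAL) instead of being stored. A userspace model of the accept/reject rule it enforces:

#include <stdio.h>

/* What proc_dointvec_minmax enforces for an entry whose .extra1 points
 * at 'min' (a NULL .extra2 means no upper bound).
 */
static int sysctl_write_ok(int new_val, const int *min, const int *max)
{
	if (min && new_val < *min)
		return 0;
	if (max && new_val > *max)
		return 0;
	return 1;
}

int main(void)
{
	int one = 1;

	printf("%d\n", sysctl_write_ok(4096, &one, NULL)); /* 1: accepted */
	printf("%d\n", sysctl_write_ok(0, &one, NULL));    /* 0: below min */
	return 0;
}
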
(!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; - if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, - attrs[TIPC_NLA_BEARER_UDP_OPTS], - tipc_nl_udp_policy, NULL)) + if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attrs[TIPC_NLA_BEARER_UDP_OPTS], tipc_nl_udp_policy, NULL)) goto err; if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 135a7ee9db03..ca54a7c7ec81 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,8 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - if (ctx->tx_conf == TLS_HW) + if (ctx->tx_conf == TLS_HW) { kfree(tls_offload_ctx_tx(ctx)); + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); + } if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); @@ -86,22 +89,6 @@ static void tls_device_gc_task(struct work_struct *work) } } -static void tls_device_attach(struct tls_context *ctx, struct sock *sk, - struct net_device *netdev) -{ - if (sk->sk_destruct != tls_device_sk_destruct) { - refcount_set(&ctx->refcount, 1); - dev_hold(netdev); - ctx->netdev = netdev; - spin_lock_irq(&tls_device_lock); - list_add_tail(&ctx->list, &tls_device_list); - spin_unlock_irq(&tls_device_lock); - - ctx->sk_destruct = sk->sk_destruct; - sk->sk_destruct = tls_device_sk_destruct; - } -} - static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { unsigned long flags; @@ -196,7 +183,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ -void tls_device_sk_destruct(struct sock *sk) +static void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); @@ -214,7 +201,13 @@ void tls_device_sk_destruct(struct sock *sk) if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); } -EXPORT_SYMBOL(tls_device_sk_destruct); + +void tls_device_free_resources_tx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + + tls_free_partial_record(sk, tls_ctx); +} static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, @@ -548,14 +541,11 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { - int rc = 0; - if (!sk->sk_write_pending && tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; sk->sk_allocation = GFP_ATOMIC; - rc = tls_push_partial_record(sk, ctx, - MSG_DONTWAIT | MSG_NOSIGNAL); + tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL); sk->sk_allocation = sk_allocation; } } @@ -574,7 +564,7 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) rx_ctx = tls_offload_ctx_rx(tls_ctx); resync_req = atomic64_read(&rx_ctx->resync_req); - req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); + req_seq = (resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); is_req_pending = resync_req; if (unlikely(is_req_pending) && req_seq == seq && @@ -587,7 +577,7 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm = strp_msg(skb); - int err = 0, offset = rxm->offset, copy, nsg; + int err = 0, offset = rxm->offset, copy, nsg, data_len, pos; struct sk_buff *skb_iter, *unused; struct scatterlist sg[1]; char *orig_buf, *buf; @@ -618,25 
+608,42 @@ static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) else err = 0; - copy = min_t(int, skb_pagelen(skb) - offset, - rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + data_len = rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE; - if (skb->decrypted) - skb_store_bits(skb, offset, buf, copy); + if (skb_pagelen(skb) > offset) { + copy = min_t(int, skb_pagelen(skb) - offset, data_len); - offset += copy; - buf += copy; + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); + offset += copy; + buf += copy; + } + + pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { - copy = min_t(int, skb_iter->len, - rxm->full_len - offset + rxm->offset - - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + int frag_pos; + + /* Practically all frags must belong to msg if reencrypt + * is needed with current strparser and coalescing logic, + * but strparser may "get optimized", so let's be safe. + */ + if (pos + skb_iter->len <= offset) + goto done_with_frag; + if (pos >= data_len + rxm->offset) + break; + + frag_pos = offset - pos; + copy = min_t(int, skb_iter->len - frag_pos, + data_len + rxm->offset - offset); if (skb_iter->decrypted) - skb_store_bits(skb_iter, offset, buf, copy); + skb_store_bits(skb_iter, frag_pos, buf, copy); offset += copy; buf += copy; +done_with_frag: + pos += skb_iter->len; } free_buf: @@ -672,6 +679,22 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) tls_device_reencrypt(sk, skb); } +static void tls_device_attach(struct tls_context *ctx, struct sock *sk, + struct net_device *netdev) +{ + if (sk->sk_destruct != tls_device_sk_destruct) { + refcount_set(&ctx->refcount, 1); + dev_hold(netdev); + ctx->netdev = netdev; + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_device_sk_destruct; + } +} + int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; @@ -855,8 +878,6 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { - pr_err_ratelimited("%s: netdev %s with no TLS offload\n", - __func__, netdev->name); rc = -ENOTSUPP; goto release_netdev; } @@ -884,17 +905,16 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); - if (rc) { - pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", - __func__); + if (rc) goto free_sw_resources; - } tls_device_attach(ctx, sk, netdev); goto release_netdev; free_sw_resources: + up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); + down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_netdev: @@ -929,8 +949,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) } out: up_read(&device_offload_lock); - kfree(tls_ctx->rx.rec_seq); - kfree(tls_ctx->rx.iv); tls_sw_release_resources_rx(sk); } @@ -1015,4 +1033,5 @@ void __exit tls_device_cleanup(void) { unregister_netdevice_notifier(&tls_dev_notifier); flush_work(&tls_device_gc_work); + clean_acked_data_flush(); } diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 54c3a758f2a7..c3a5fe624b4e 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -194,18 +194,26 @@ static void update_chksum(struct sk_buff *skb, int headln) static void complete_skb(struct sk_buff 
*nskb, struct sk_buff *skb, int headln) { + struct sock *sk = skb->sk; + int delta; + skb_copy_header(nskb, skb); skb_put(nskb, skb->len); memcpy(nskb->data, skb->data, headln); - update_chksum(nskb, headln); nskb->destructor = skb->destructor; - nskb->sk = skb->sk; + nskb->sk = sk; skb->destructor = NULL; skb->sk = NULL; - refcount_add(nskb->truesize - skb->truesize, - &nskb->sk->sk_wmem_alloc); + + update_chksum(nskb, headln); + + delta = nskb->truesize - skb->truesize; + if (likely(delta < 0)) + WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); + else if (delta) + refcount_add(delta, &sk->sk_wmem_alloc); } /* This function may be called after the user socket is already diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index df921a2904b9..fc81ae18cc44 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -208,6 +208,26 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } +bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx) +{ + struct scatterlist *sg; + + sg = ctx->partially_sent_record; + if (!sg) + return false; + + while (1) { + put_page(sg_page(sg)); + sk_mem_uncharge(sk, sg->length); + + if (sg_is_last(sg)) + break; + sg++; + } + ctx->partially_sent_record = NULL; + return true; +} + static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); @@ -267,13 +287,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); tls_sw_free_resources_tx(sk); +#ifdef CONFIG_TLS_DEVICE + } else if (ctx->tx_conf == TLS_HW) { + tls_device_free_resources_tx(sk); +#endif } - if (ctx->rx_conf == TLS_SW) { - kfree(ctx->rx.rec_seq); - kfree(ctx->rx.iv); + if (ctx->rx_conf == TLS_SW) tls_sw_free_resources_rx(sk); - } #ifdef CONFIG_TLS_DEVICE if (ctx->rx_conf == TLS_HW) @@ -469,27 +490,32 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: + optsize = sizeof(struct tls12_crypto_info_aes_gcm_128); + break; case TLS_CIPHER_AES_GCM_256: { - optsize = crypto_info->cipher_type == TLS_CIPHER_AES_GCM_128 ? 
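
complete_skb() now handles both signs of the truesize delta: the rebuilt fallback skb is normally smaller than the original, so sk_wmem_alloc must be decremented, and decrementing it all the way to zero would indicate a bug (hence the WARN_ON_ONCE around refcount_sub_and_test). A simplified userspace model of that accounting update, with a plain counter standing in for the refcount:

#include <stdio.h>

/* Model of the sk_wmem_alloc adjustment; 'alloc' plays the refcount. */
static void account_truesize_delta(unsigned int *alloc,
				   unsigned int old_truesize,
				   unsigned int new_truesize)
{
	int delta = (int)new_truesize - (int)old_truesize;

	if (delta < 0) {
		if (*alloc <= (unsigned int)-delta)       /* WARN_ON_ONCE case */
			fprintf(stderr, "bug: wmem would hit zero\n");
		*alloc -= (unsigned int)-delta;
	} else if (delta) {
		*alloc += (unsigned int)delta;
	}
}

int main(void)
{
	unsigned int wmem = 10000;

	account_truesize_delta(&wmem, 4096, 3000); /* typical: fallback shrinks */
	printf("wmem = %u\n", wmem);               /* 8904 */
	return 0;
}
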
- sizeof(struct tls12_crypto_info_aes_gcm_128) : - sizeof(struct tls12_crypto_info_aes_gcm_256); - if (optlen != optsize) { - rc = -EINVAL; - goto err_crypto_info; - } - rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), - optlen - sizeof(*crypto_info)); - if (rc) { - rc = -EFAULT; - goto err_crypto_info; - } + optsize = sizeof(struct tls12_crypto_info_aes_gcm_256); break; } + case TLS_CIPHER_AES_CCM_128: + optsize = sizeof(struct tls12_crypto_info_aes_ccm_128); + break; default: rc = -EINVAL; goto err_crypto_info; } + if (optlen != optsize) { + rc = -EINVAL; + goto err_crypto_info; + } + + rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info), + optlen - sizeof(*crypto_info)); + if (rc) { + rc = -EFAULT; + goto err_crypto_info; + } + if (tx) { #ifdef CONFIG_TLS_DEVICE rc = tls_set_device_offload(sk, ctx); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 425351ac2a9b..d93f83f77864 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -42,8 +42,6 @@ #include <net/strparser.h> #include <net/tls.h> -#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE - static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { @@ -121,23 +119,25 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len) } static int padding_length(struct tls_sw_context_rx *ctx, - struct tls_context *tls_ctx, struct sk_buff *skb) + struct tls_prot_info *prot, struct sk_buff *skb) { struct strp_msg *rxm = strp_msg(skb); int sub = 0; /* Determine zero-padding length */ - if (tls_ctx->prot_info.version == TLS_1_3_VERSION) { + if (prot->version == TLS_1_3_VERSION) { char content_type = 0; int err; int back = 17; while (content_type == 0) { - if (back > rxm->full_len) + if (back > rxm->full_len - prot->prepend_size) return -EBADMSG; err = skb_copy_bits(skb, rxm->offset + rxm->full_len - back, &content_type, 1); + if (err) + return err; if (content_type) break; sub++; @@ -172,9 +172,17 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) tls_err_abort(skb->sk, err); } else { struct strp_msg *rxm = strp_msg(skb); - rxm->full_len -= padding_length(ctx, tls_ctx, skb); - rxm->offset += prot->prepend_size; - rxm->full_len -= prot->overhead_size; + int pad; + + pad = padding_length(ctx, prot, skb); + if (pad < 0) { + ctx->async_wait.err = pad; + tls_err_abort(skb->sk, pad); + } else { + rxm->full_len -= pad; + rxm->offset += prot->prepend_size; + rxm->full_len -= prot->overhead_size; + } } /* After using skb->sk to propagate sk through crypto async callback @@ -225,7 +233,7 @@ static int tls_do_decryption(struct sock *sk, /* Using skb->sk to push sk through to crypto async callback * handler. This allows propagating errors up to the socket * if needed. It _must_ be cleared in the async handler - * before kfree_skb is called. We _know_ skb->sk is NULL + * before consume_skb is called. We _know_ skb->sk is NULL * because it is a clone from strparser. 
*/ skb->sk = sk; @@ -479,11 +487,18 @@ static int tls_do_encryption(struct sock *sk, struct tls_rec *rec = ctx->open_rec; struct sk_msg *msg_en = &rec->msg_encrypted; struct scatterlist *sge = sk_msg_elem(msg_en, start); - int rc; + int rc, iv_offset = 0; + + /* For CCM based ciphers, first byte of IV is a constant */ + if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { + rec->iv_data[0] = TLS_AES_CCM_IV_B0_BYTE; + iv_offset = 1; + } - memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data)); - xor_iv_with_seq(prot->version, rec->iv_data, - tls_ctx->tx.rec_seq); + memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, + prot->iv_size + prot->salt_size); + + xor_iv_with_seq(prot->version, rec->iv_data, tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; @@ -1344,6 +1359,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout = NULL; const int data_len = rxm->full_len - prot->overhead_size + prot->tail_size; + int iv_offset = 0; if (*zc && (out_iov || out_sg)) { if (out_iov) @@ -1386,18 +1402,25 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, aad = (u8 *)(sgout + n_sgout); iv = aad + prot->aad_size; + /* For CCM based ciphers, first byte of nonce+iv is always '2' */ + if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { + iv[0] = 2; + iv_offset = 1; + } + /* Prepare IV */ err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, - iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + iv + iv_offset + prot->salt_size, prot->iv_size); if (err < 0) { kfree(mem); return err; } if (prot->version == TLS_1_3_VERSION) - memcpy(iv, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv)); + memcpy(iv + iv_offset, tls_ctx->rx.iv, + crypto_aead_ivsize(ctx->aead_recv)); else - memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); xor_iv_with_seq(prot->version, iv, tls_ctx->rx.rec_seq); @@ -1465,7 +1488,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, struct tls_prot_info *prot = &tls_ctx->prot_info; int version = prot->version; struct strp_msg *rxm = strp_msg(skb); - int err = 0; + int pad, err = 0; if (!ctx->decrypted) { #ifdef CONFIG_TLS_DEVICE @@ -1484,9 +1507,15 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, return err; } + } else { + *zc = false; } - rxm->full_len -= padding_length(ctx, tls_ctx, skb); + pad = padding_length(ctx, prot, skb); + if (pad < 0) + return pad; + + rxm->full_len -= pad; rxm->offset += prot->prepend_size; rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, &tls_ctx->rx, version); @@ -1522,7 +1551,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, rxm->full_len -= len; return false; } - kfree_skb(skb); + consume_skb(skb); } /* Finished with message */ @@ -1631,7 +1660,7 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, if (!is_peek) { skb_unlink(skb, &ctx->rx_list); - kfree_skb(skb); + consume_skb(skb); } skb = next_skb; @@ -2050,20 +2079,7 @@ void tls_sw_free_resources_tx(struct sock *sk) /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. 
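
For AES-CCM the AEAD nonce gains a fixed leading flags byte (the diff sets it from TLS_AES_CCM_IV_B0_BYTE, which is 2), so salt and per-record IV are copied in starting at offset 1. The sketch below illustrates the resulting 1 + 4 + 8 byte buffer with a TLS 1.3-style sequence-number XOR into the IV portion; the exact XOR offsets in the kernel come from xor_iv_with_seq() and depend on the TLS version, so treat this as an approximation, not byte-exact kernel behavior:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CCM_B0_BYTE 2   /* TLS_AES_CCM_IV_B0_BYTE */
#define SALT_SIZE   4
#define IV_SIZE     8

/* Build the CCM nonce buffer: flags byte, then salt, then the
 * per-record IV XORed with the 8-byte record sequence number.
 */
static void build_ccm_nonce(uint8_t out[1 + SALT_SIZE + IV_SIZE],
			    const uint8_t salt[SALT_SIZE],
			    const uint8_t iv[IV_SIZE],
			    const uint8_t rec_seq[8])
{
	out[0] = CCM_B0_BYTE;                  /* iv_offset = 1 from here on */
	memcpy(out + 1, salt, SALT_SIZE);
	memcpy(out + 1 + SALT_SIZE, iv, IV_SIZE);
	for (int i = 0; i < 8; i++)            /* seq XOR, TLS 1.3 style */
		out[1 + SALT_SIZE + i] ^= rec_seq[i];
}

int main(void)
{
	uint8_t salt[SALT_SIZE] = {0}, iv[IV_SIZE] = {0};
	uint8_t seq[8] = {0, 0, 0, 0, 0, 0, 0, 1};
	uint8_t nonce[1 + SALT_SIZE + IV_SIZE];

	build_ccm_nonce(nonce, salt, iv, seq);
	for (size_t i = 0; i < sizeof(nonce); i++)
		printf("%02x", nonce[i]);      /* 02 00000000 ...01 */
	printf("\n");
	return 0;
}
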
*/ - if (tls_ctx->partially_sent_record) { - struct scatterlist *sg = tls_ctx->partially_sent_record; - - while (1) { - put_page(sg_page(sg)); - sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; - } - - tls_ctx->partially_sent_record = NULL; - + if (tls_free_partial_record(sk, tls_ctx)) { rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); @@ -2089,6 +2105,9 @@ void tls_sw_release_resources_rx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + if (ctx->aead_recv) { kfree_skb(ctx->recv_pkt); ctx->recv_pkt = NULL; @@ -2152,14 +2171,15 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; + struct tls12_crypto_info_aes_ccm_128 *ccm_128_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; struct crypto_aead **aead; struct strp_callbacks cb; - u16 nonce_size, tag_size, iv_size, rec_seq_size; + u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size; struct crypto_tfm *tfm; - char *iv, *rec_seq, *key, *salt; + char *iv, *rec_seq, *key, *salt, *cipher_name; size_t keysize; int rc = 0; @@ -2224,6 +2244,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) keysize = TLS_CIPHER_AES_GCM_128_KEY_SIZE; key = gcm_128_info->key; salt = gcm_128_info->salt; + salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE; + cipher_name = "gcm(aes)"; break; } case TLS_CIPHER_AES_GCM_256: { @@ -2239,6 +2261,25 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) keysize = TLS_CIPHER_AES_GCM_256_KEY_SIZE; key = gcm_256_info->key; salt = gcm_256_info->salt; + salt_size = TLS_CIPHER_AES_GCM_256_SALT_SIZE; + cipher_name = "gcm(aes)"; + break; + } + case TLS_CIPHER_AES_CCM_128: { + nonce_size = TLS_CIPHER_AES_CCM_128_IV_SIZE; + tag_size = TLS_CIPHER_AES_CCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_AES_CCM_128_IV_SIZE; + iv = ((struct tls12_crypto_info_aes_ccm_128 *)crypto_info)->iv; + rec_seq_size = TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE; + rec_seq = + ((struct tls12_crypto_info_aes_ccm_128 *)crypto_info)->rec_seq; + ccm_128_info = + (struct tls12_crypto_info_aes_ccm_128 *)crypto_info; + keysize = TLS_CIPHER_AES_CCM_128_KEY_SIZE; + key = ccm_128_info->key; + salt = ccm_128_info->salt; + salt_size = TLS_CIPHER_AES_CCM_128_SALT_SIZE; + cipher_name = "ccm(aes)"; break; } default: @@ -2268,16 +2309,16 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) prot->overhead_size = prot->prepend_size + prot->tag_size + prot->tail_size; prot->iv_size = iv_size; - cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + prot->salt_size = salt_size; + cctx->iv = kmalloc(iv_size + salt_size, GFP_KERNEL); if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } /* Note: 128 & 256 bit salt are the same size */ - memcpy(cctx->iv, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); prot->rec_seq_size = rec_seq_size; + memcpy(cctx->iv, salt, salt_size); + memcpy(cctx->iv + salt_size, iv, iv_size); cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); if (!cctx->rec_seq) { rc = -ENOMEM; @@ -2285,7 +2326,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (!*aead) { - *aead 
= crypto_alloc_aead("gcm(aes)", 0, 0); + *aead = crypto_alloc_aead(cipher_name, 0, 0); if (IS_ERR(*aead)) { rc = PTR_ERR(*aead); *aead = NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ddb838a1b74c..e68d7454f2e3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2040,8 +2040,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; + int skip; int err; - int peeked, skip; err = -EOPNOTSUPP; if (flags&MSG_OOB) @@ -2053,8 +2053,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, mutex_lock(&u->iolock); skip = sk_peek_offset(sk, flags); - skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip, - &err, &last); + skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err, + &last); if (skb) break; diff --git a/net/wimax/stack.c b/net/wimax/stack.c index a6307813b6d5..4969de672886 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -419,26 +419,26 @@ static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = { static const struct genl_ops wimax_gnl_ops[] = { { .cmd = WIMAX_GNL_OP_MSG_FROM_USER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = wimax_gnl_policy, .doit = wimax_gnl_doit_msg_from_user, }, { .cmd = WIMAX_GNL_OP_RESET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = wimax_gnl_policy, .doit = wimax_gnl_doit_reset, }, { .cmd = WIMAX_GNL_OP_RFKILL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = wimax_gnl_policy, .doit = wimax_gnl_doit_rfkill, }, { .cmd = WIMAX_GNL_OP_STATE_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, - .policy = wimax_gnl_policy, .doit = wimax_gnl_doit_state_get, }, }; @@ -582,6 +582,7 @@ struct genl_family wimax_gnl_family __ro_after_init = { .version = WIMAX_GNL_VERSION, .hdrsize = 0, .maxattr = WIMAX_GNL_ATTR_MAX, + .policy = wimax_gnl_policy, .module = THIS_MODULE, .ops = wimax_gnl_ops, .n_ops = ARRAY_SIZE(wimax_gnl_ops), diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c index 35f06563207d..11eaa5956f00 100644 --- a/net/wireless/lib80211_crypt_tkip.c +++ b/net/wireless/lib80211_crypt_tkip.c @@ -501,7 +501,6 @@ static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, } desc->tfm = tfm_michael; - desc->flags = 0; if (crypto_shash_setkey(tfm_michael, key, 8)) return -1; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 25a9e3b5c154..fffe4b371e23 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -331,6 +331,11 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = NL80211_MAX_SUPP_RATES }, [NL80211_ATTR_STA_PLINK_ACTION] = NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_ACTIONS - 1), + [NL80211_ATTR_STA_TX_POWER_SETTING] = + NLA_POLICY_RANGE(NLA_U8, + NL80211_TX_POWER_AUTOMATIC, + NL80211_TX_POWER_FIXED), + [NL80211_ATTR_STA_TX_POWER] = { .type = NLA_S16 }, [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 }, [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ }, [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY, @@ -553,6 +558,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, [NL80211_KEY_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1), [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED }, + 
[NL80211_KEY_MODE] = NLA_POLICY_RANGE(NLA_U8, 0, NL80211_KEY_SET_TX), }; /* policy for the key default flags */ @@ -618,11 +624,20 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { }; static const struct nla_policy +nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = { + [NL80211_BAND_2GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_5GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_60GHZ] = { .type = NLA_S32 }, +}; + +static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { .len = ETH_ALEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, + [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = + NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), }; static const struct nla_policy @@ -688,9 +703,11 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb, int err; if (!cb->args[0]) { - err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, - genl_family_attrbuf(&nl80211_fam), - nl80211_fam.maxattr, nl80211_policy, NULL); + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl80211_fam.hdrsize, + genl_family_attrbuf(&nl80211_fam), + nl80211_fam.maxattr, + nl80211_policy, NULL); if (err) return err; @@ -740,13 +757,13 @@ static int nl80211_msg_put_wmm_rules(struct sk_buff *msg, { int j; struct nlattr *nl_wmm_rules = - nla_nest_start(msg, NL80211_FREQUENCY_ATTR_WMM); + nla_nest_start_noflag(msg, NL80211_FREQUENCY_ATTR_WMM); if (!nl_wmm_rules) goto nla_put_failure; for (j = 0; j < IEEE80211_NUM_ACS; j++) { - struct nlattr *nl_wmm_rule = nla_nest_start(msg, j); + struct nlattr *nl_wmm_rule = nla_nest_start_noflag(msg, j); if (!nl_wmm_rule) goto nla_put_failure; @@ -875,7 +892,7 @@ static bool nl80211_put_txq_stats(struct sk_buff *msg, return false; \ } while (0) - txqattr = nla_nest_start(msg, attrtype); + txqattr = nla_nest_start_noflag(msg, attrtype); if (!txqattr) return false; @@ -910,8 +927,9 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, struct key_parse *k) { struct nlattr *tb[NL80211_KEY_MAX + 1]; - int err = nla_parse_nested(tb, NL80211_KEY_MAX, key, - nl80211_key_policy, info->extack); + int err = nla_parse_nested_deprecated(tb, NL80211_KEY_MAX, key, + nl80211_key_policy, + info->extack); if (err) return err; @@ -947,10 +965,11 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, if (tb[NL80211_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; - err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, - tb[NL80211_KEY_DEFAULT_TYPES], - nl80211_key_default_policy, - info->extack); + err = nla_parse_nested_deprecated(kdt, + NUM_NL80211_KEY_DEFAULT_TYPES - 1, + tb[NL80211_KEY_DEFAULT_TYPES], + nl80211_key_default_policy, + info->extack); if (err) return err; @@ -958,6 +977,9 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key, k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST]; } + if (tb[NL80211_KEY_MODE]) + k->p.mode = nla_get_u8(tb[NL80211_KEY_MODE]); + return 0; } @@ -994,11 +1016,11 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k) if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; - int err = nla_parse_nested(kdt, - NUM_NL80211_KEY_DEFAULT_TYPES - 1, - info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES], - nl80211_key_default_policy, - info->extack); + int err = nla_parse_nested_deprecated(kdt, + 
NUM_NL80211_KEY_DEFAULT_TYPES - 1, + info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES], + nl80211_key_default_policy, + info->extack); if (err) return err; @@ -1187,7 +1209,7 @@ static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, static int nl80211_put_iftypes(struct sk_buff *msg, u32 attr, u16 ifmodes) { - struct nlattr *nl_modes = nla_nest_start(msg, attr); + struct nlattr *nl_modes = nla_nest_start_noflag(msg, attr); int i; if (!nl_modes) @@ -1215,8 +1237,8 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy, struct nlattr *nl_combis; int i, j; - nl_combis = nla_nest_start(msg, - NL80211_ATTR_INTERFACE_COMBINATIONS); + nl_combis = nla_nest_start_noflag(msg, + NL80211_ATTR_INTERFACE_COMBINATIONS); if (!nl_combis) goto nla_put_failure; @@ -1226,18 +1248,19 @@ static int nl80211_put_iface_combinations(struct wiphy *wiphy, c = &wiphy->iface_combinations[i]; - nl_combi = nla_nest_start(msg, i + 1); + nl_combi = nla_nest_start_noflag(msg, i + 1); if (!nl_combi) goto nla_put_failure; - nl_limits = nla_nest_start(msg, NL80211_IFACE_COMB_LIMITS); + nl_limits = nla_nest_start_noflag(msg, + NL80211_IFACE_COMB_LIMITS); if (!nl_limits) goto nla_put_failure; for (j = 0; j < c->n_limits; j++) { struct nlattr *nl_limit; - nl_limit = nla_nest_start(msg, j + 1); + nl_limit = nla_nest_start_noflag(msg, j + 1); if (!nl_limit) goto nla_put_failure; if (nla_put_u32(msg, NL80211_IFACE_LIMIT_MAX, @@ -1290,7 +1313,8 @@ static int nl80211_send_wowlan_tcp_caps(struct cfg80211_registered_device *rdev, if (!tcp) return 0; - nl_tcp = nla_nest_start(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION); + nl_tcp = nla_nest_start_noflag(msg, + NL80211_WOWLAN_TRIG_TCP_CONNECTION); if (!nl_tcp) return -ENOBUFS; @@ -1330,7 +1354,8 @@ static int nl80211_send_wowlan(struct sk_buff *msg, if (!rdev->wiphy.wowlan) return 0; - nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED); + nl_wowlan = nla_nest_start_noflag(msg, + NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED); if (!nl_wowlan) return -ENOBUFS; @@ -1459,7 +1484,8 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, if (sband->n_iftype_data) { struct nlattr *nl_iftype_data = - nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA); + nla_nest_start_noflag(msg, + NL80211_BAND_ATTR_IFTYPE_DATA); int err; if (!nl_iftype_data) @@ -1468,7 +1494,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, for (i = 0; i < sband->n_iftype_data; i++) { struct nlattr *iftdata; - iftdata = nla_nest_start(msg, i + 1); + iftdata = nla_nest_start_noflag(msg, i + 1); if (!iftdata) return -ENOBUFS; @@ -1484,12 +1510,12 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, } /* add bitrates */ - nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); + nl_rates = nla_nest_start_noflag(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) return -ENOBUFS; for (i = 0; i < sband->n_bitrates; i++) { - nl_rate = nla_nest_start(msg, i); + nl_rate = nla_nest_start_noflag(msg, i); if (!nl_rate) return -ENOBUFS; @@ -1522,12 +1548,12 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg, if (!mgmt_stypes) return 0; - nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES); + nl_ifs = nla_nest_start_noflag(msg, NL80211_ATTR_TX_FRAME_TYPES); if (!nl_ifs) return -ENOBUFS; for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { - nl_ftypes = nla_nest_start(msg, ift); + nl_ftypes = nla_nest_start_noflag(msg, ift); if (!nl_ftypes) return -ENOBUFS; i = 0; @@ -1545,12 +1571,12 @@ nl80211_send_mgmt_stypes(struct sk_buff *msg, nla_nest_end(msg, nl_ifs); - nl_ifs = 
nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); + nl_ifs = nla_nest_start_noflag(msg, NL80211_ATTR_RX_FRAME_TYPES); if (!nl_ifs) return -ENOBUFS; for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { - nl_ftypes = nla_nest_start(msg, ift); + nl_ftypes = nla_nest_start_noflag(msg, ift); if (!nl_ftypes) return -ENOBUFS; i = 0; @@ -1668,7 +1694,7 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, if (!cap->ftm.supported) return 0; - ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM); + ftm = nla_nest_start_noflag(msg, NL80211_PMSR_TYPE_FTM); if (!ftm) return -ENOBUFS; @@ -1716,7 +1742,7 @@ static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev, * will genlmsg_cancel() if we fail */ - pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS); + pmsr = nla_nest_start_noflag(msg, NL80211_ATTR_PEER_MEASUREMENTS); if (!pmsr) return -ENOBUFS; @@ -1731,7 +1757,7 @@ static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev, nla_put_flag(msg, NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR)) return -ENOBUFS; - caps = nla_nest_start(msg, NL80211_PMSR_ATTR_TYPE_CAPA); + caps = nla_nest_start_noflag(msg, NL80211_PMSR_ATTR_TYPE_CAPA); if (!caps) return -ENOBUFS; @@ -1892,7 +1918,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, break; /* fall through */ case 3: - nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS); + nl_bands = nla_nest_start_noflag(msg, + NL80211_ATTR_WIPHY_BANDS); if (!nl_bands) goto nla_put_failure; @@ -1905,7 +1932,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (!sband) continue; - nl_band = nla_nest_start(msg, band); + nl_band = nla_nest_start_noflag(msg, band); if (!nl_band) goto nla_put_failure; @@ -1919,15 +1946,16 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, /* fall through */ default: /* add frequencies */ - nl_freqs = nla_nest_start( - msg, NL80211_BAND_ATTR_FREQS); + nl_freqs = nla_nest_start_noflag(msg, + NL80211_BAND_ATTR_FREQS); if (!nl_freqs) goto nla_put_failure; for (i = state->chan_start - 1; i < sband->n_channels; i++) { - nl_freq = nla_nest_start(msg, i); + nl_freq = nla_nest_start_noflag(msg, + i); if (!nl_freq) goto nla_put_failure; @@ -1972,7 +2000,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, break; /* fall through */ case 4: - nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS); + nl_cmds = nla_nest_start_noflag(msg, + NL80211_ATTR_SUPPORTED_COMMANDS); if (!nl_cmds) goto nla_put_failure; @@ -2120,7 +2149,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, const struct nl80211_vendor_cmd_info *info; struct nlattr *nested; - nested = nla_nest_start(msg, NL80211_ATTR_VENDOR_DATA); + nested = nla_nest_start_noflag(msg, + NL80211_ATTR_VENDOR_DATA); if (!nested) goto nla_put_failure; @@ -2136,8 +2166,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, const struct nl80211_vendor_cmd_info *info; struct nlattr *nested; - nested = nla_nest_start(msg, - NL80211_ATTR_VENDOR_EVENTS); + nested = nla_nest_start_noflag(msg, + NL80211_ATTR_VENDOR_EVENTS); if (!nested) goto nla_put_failure; @@ -2174,7 +2204,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; - nested = nla_nest_start(msg, NL80211_ATTR_BSS_SELECT); + nested = nla_nest_start_noflag(msg, + NL80211_ATTR_BSS_SELECT); if (!nested) goto nla_put_failure; @@ -2196,8 +2227,8 @@ static int 
nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.iftype_ext_capab) { struct nlattr *nested_ext_capab, *nested; - nested = nla_nest_start(msg, - NL80211_ATTR_IFTYPE_EXT_CAPA); + nested = nla_nest_start_noflag(msg, + NL80211_ATTR_IFTYPE_EXT_CAPA); if (!nested) goto nla_put_failure; @@ -2207,7 +2238,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, capab = &rdev->wiphy.iftype_ext_capab[i]; - nested_ext_capab = nla_nest_start(msg, i); + nested_ext_capab = nla_nest_start_noflag(msg, + i); if (!nested_ext_capab || nla_put_u32(msg, NL80211_ATTR_IFTYPE, capab->iftype) || @@ -2289,8 +2321,10 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, struct nl80211_dump_wiphy_state *state) { struct nlattr **tb = genl_family_attrbuf(&nl80211_fam); - int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, tb, - nl80211_fam.maxattr, nl80211_policy, NULL); + int ret = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl80211_fam.hdrsize, + tb, nl80211_fam.maxattr, + nl80211_policy, NULL); /* ignore parse errors for backward compatibility */ if (ret) return 0; @@ -2733,10 +2767,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(nl_txq_params, info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS], rem_txq_params) { - result = nla_parse_nested(tb, NL80211_TXQ_ATTR_MAX, - nl_txq_params, - txq_params_policy, - info->extack); + result = nla_parse_nested_deprecated(tb, + NL80211_TXQ_ATTR_MAX, + nl_txq_params, + txq_params_policy, + info->extack); if (result) return result; result = parse_txq_params(tb, &txq_params); @@ -3193,8 +3228,7 @@ static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) if (!nla) return -EINVAL; - if (nla_parse_nested(flags, NL80211_MNTR_FLAG_MAX, nla, - mntr_flags_policy, NULL)) + if (nla_parse_nested_deprecated(flags, NL80211_MNTR_FLAG_MAX, nla, mntr_flags_policy, NULL)) return -EINVAL; for (flag = 1; flag <= NL80211_MNTR_FLAG_MAX; flag++) @@ -3521,7 +3555,7 @@ static void get_key_callback(void *c, struct key_params *params) params->cipher))) goto nla_put_failure; - key = nla_nest_start(cookie->msg, NL80211_ATTR_KEY); + key = nla_nest_start_noflag(cookie->msg, NL80211_ATTR_KEY); if (!key) goto nla_put_failure; @@ -3634,8 +3668,11 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (key.idx < 0) return -EINVAL; - /* only support setting default key */ - if (!key.def && !key.defmgmt) + /* Only support setting default key and + * Extended Key ID action NL80211_KEY_SET_TX. 
+ */ + if (!key.def && !key.defmgmt && + !(key.p.mode == NL80211_KEY_SET_TX)) return -EINVAL; wdev_lock(dev->ieee80211_ptr); @@ -3659,7 +3696,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_CFG80211_WEXT dev->ieee80211_ptr->wext.default_key = key.idx; #endif - } else { + } else if (key.defmgmt) { if (key.def_uni || !key.def_multi) { err = -EINVAL; goto out; @@ -3681,8 +3718,25 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) #ifdef CONFIG_CFG80211_WEXT dev->ieee80211_ptr->wext.default_mgmt_key = key.idx; #endif - } + } else if (key.p.mode == NL80211_KEY_SET_TX && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) { + u8 *mac_addr = NULL; + + if (info->attrs[NL80211_ATTR_MAC]) + mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (!mac_addr || key.idx < 0 || key.idx > 1) { + err = -EINVAL; + goto out; + } + err = rdev_add_key(rdev, dev, key.idx, + NL80211_KEYTYPE_PAIRWISE, + mac_addr, &key.p); + } else { + err = -EINVAL; + } out: wdev_unlock(dev->ieee80211_ptr); @@ -3843,8 +3897,7 @@ static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy, if (n_entries > wiphy->max_acl_mac_addrs) return ERR_PTR(-ENOTSUPP); - acl = kzalloc(sizeof(*acl) + (sizeof(struct mac_address) * n_entries), - GFP_KERNEL); + acl = kzalloc(struct_size(acl, mac_addrs, n_entries), GFP_KERNEL); if (!acl) return ERR_PTR(-ENOMEM); @@ -4054,8 +4107,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, sband = rdev->wiphy.bands[band]; if (sband == NULL) return -EINVAL; - err = nla_parse_nested(tb, NL80211_TXRATE_MAX, tx_rates, - nl80211_txattr_policy, info->extack); + err = nla_parse_nested_deprecated(tb, NL80211_TXRATE_MAX, + tx_rates, + nl80211_txattr_policy, + info->extack); if (err) return err; if (tb[NL80211_TXRATE_LEGACY]) { @@ -4223,9 +4278,10 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, if (attrs[NL80211_ATTR_FTM_RESPONDER]) { struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1]; - err = nla_parse_nested(tb, NL80211_FTM_RESP_ATTR_MAX, - attrs[NL80211_ATTR_FTM_RESPONDER], - NULL, NULL); + err = nla_parse_nested_deprecated(tb, + NL80211_FTM_RESP_ATTR_MAX, + attrs[NL80211_ATTR_FTM_RESPONDER], + NULL, NULL); if (err) return err; @@ -4633,8 +4689,7 @@ static int parse_station_flags(struct genl_info *info, if (!nla) return 0; - if (nla_parse_nested(flags, NL80211_STA_FLAG_MAX, nla, - sta_flags_policy, info->extack)) + if (nla_parse_nested_deprecated(flags, NL80211_STA_FLAG_MAX, nla, sta_flags_policy, info->extack)) return -EINVAL; /* @@ -4686,7 +4741,7 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) u16 bitrate_compat; enum nl80211_rate_info rate_flg; - rate = nla_nest_start(msg, attr); + rate = nla_nest_start_noflag(msg, attr); if (!rate) return false; @@ -4773,7 +4828,7 @@ static bool nl80211_put_signal(struct sk_buff *msg, u8 mask, s8 *signal, if (!mask) return true; - attr = nla_nest_start(msg, id); + attr = nla_nest_start_noflag(msg, id); if (!attr) return false; @@ -4808,7 +4863,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, nla_put_u32(msg, NL80211_ATTR_GENERATION, sinfo->generation)) goto nla_put_failure; - sinfoattr = nla_nest_start(msg, NL80211_ATTR_STA_INFO); + sinfoattr = nla_nest_start_noflag(msg, NL80211_ATTR_STA_INFO); if (!sinfoattr) goto nla_put_failure; @@ -4889,6 +4944,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(TX_RETRIES, tx_retries, u32); 
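/*
 * Aside on struct_size(): the parse_acl_data() and nl80211_set_reg() hunks
 * in this file replace open-coded "sizeof(hdr) + n * sizeof(elem)" math
 * with struct_size() from <linux/overflow.h>, which saturates instead of
 * wrapping on overflow. A minimal sketch of the pattern, using a
 * hypothetical struct rather than the cfg80211 ones:
 */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_list {			/* hypothetical, illustration only */
	unsigned int n;
	u64 vals[];			/* flexible array member */
};

static struct demo_list *demo_list_alloc(unsigned int n)
{
	struct demo_list *dl;

	/* struct_size(dl, vals, n) == sizeof(*dl) + n * sizeof(dl->vals[0]),
	 * except that an overflowing multiply saturates to SIZE_MAX, so the
	 * allocation fails cleanly instead of coming back undersized.
	 */
	dl = kzalloc(struct_size(dl, vals, n), GFP_KERNEL);
	if (dl)
		dl->n = n;
	return dl;
}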
PUT_SINFO(TX_FAILED, tx_failed, u32); PUT_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32); + PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32); PUT_SINFO(BEACON_LOSS, beacon_loss_count, u32); PUT_SINFO(LOCAL_PM, local_pm, u32); PUT_SINFO(PEER_PM, peer_pm, u32); @@ -4896,7 +4952,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { - bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); + bss_param = nla_nest_start_noflag(msg, + NL80211_STA_INFO_BSS_PARAM); if (!bss_param) goto nla_put_failure; @@ -4939,7 +4996,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, struct nlattr *tidsattr; int tid; - tidsattr = nla_nest_start(msg, NL80211_STA_INFO_TID_STATS); + tidsattr = nla_nest_start_noflag(msg, + NL80211_STA_INFO_TID_STATS); if (!tidsattr) goto nla_put_failure; @@ -4952,7 +5010,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, if (!tidstats->filled) continue; - tidattr = nla_nest_start(msg, tid + 1); + tidattr = nla_nest_start_noflag(msg, tid + 1); if (!tidattr) goto nla_put_failure; @@ -5300,8 +5358,9 @@ static int nl80211_parse_sta_wme(struct genl_info *info, return 0; nla = info->attrs[NL80211_ATTR_STA_WME]; - err = nla_parse_nested(tb, NL80211_STA_WME_MAX, nla, - nl80211_sta_wme_policy, info->extack); + err = nla_parse_nested_deprecated(tb, NL80211_STA_WME_MAX, nla, + nl80211_sta_wme_policy, + info->extack); if (err) return err; @@ -5387,6 +5446,36 @@ static int nl80211_set_station_tdls(struct genl_info *info, return nl80211_parse_sta_wme(info, params); } +static int nl80211_parse_sta_txpower_setting(struct genl_info *info, + struct station_parameters *params) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int idx; + + if (info->attrs[NL80211_ATTR_STA_TX_POWER_SETTING]) { + if (!rdev->ops->set_tx_power || + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_STA_TX_PWR)) + return -EOPNOTSUPP; + + idx = NL80211_ATTR_STA_TX_POWER_SETTING; + params->txpwr.type = nla_get_u8(info->attrs[idx]); + + if (params->txpwr.type == NL80211_TX_POWER_LIMITED) { + idx = NL80211_ATTR_STA_TX_POWER; + + if (info->attrs[idx]) + params->txpwr.power = + nla_get_s16(info->attrs[idx]); + else + return -EINVAL; + } + params->sta_modify_mask |= STATION_PARAM_APPLY_STA_TXPOWER; + } + + return 0; +} + static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -5480,6 +5569,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; + err = nl80211_parse_sta_txpower_setting(info, &params); + if (err) + return err; + /* Include parameters for TDLS peer (will check later) */ err = nl80211_set_station_tdls(info, &params); if (err) @@ -5617,6 +5710,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) return -EOPNOTSUPP; + err = nl80211_parse_sta_txpower_setting(info, &params); + if (err) + return err; + err = nl80211_parse_sta_channel_info(info, &params); if (err) return err; @@ -5799,7 +5896,7 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u32(msg, NL80211_ATTR_GENERATION, pinfo->generation)) goto nla_put_failure; - pinfoattr = nla_nest_start(msg, NL80211_ATTR_MPATH_INFO); + pinfoattr = nla_nest_start_noflag(msg, 
NL80211_ATTR_MPATH_INFO); if (!pinfoattr) goto nla_put_failure; if ((pinfo->filled & MPATH_INFO_FRAME_QLEN) && @@ -6250,7 +6347,7 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, NL80211_CMD_GET_MESH_CONFIG); if (!hdr) goto out; - pinfoattr = nla_nest_start(msg, NL80211_ATTR_MESH_CONFIG); + pinfoattr = nla_nest_start_noflag(msg, NL80211_ATTR_MESH_CONFIG); if (!pinfoattr) goto nla_put_failure; if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || @@ -6403,9 +6500,7 @@ do { \ if (!info->attrs[NL80211_ATTR_MESH_CONFIG]) return -EINVAL; - if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX, - info->attrs[NL80211_ATTR_MESH_CONFIG], - nl80211_meshconf_params_policy, info->extack)) + if (nla_parse_nested_deprecated(tb, NL80211_MESHCONF_ATTR_MAX, info->attrs[NL80211_ATTR_MESH_CONFIG], nl80211_meshconf_params_policy, info->extack)) return -EINVAL; /* This makes sure that there aren't more than 32 mesh config @@ -6538,9 +6633,7 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, if (!info->attrs[NL80211_ATTR_MESH_SETUP]) return -EINVAL; - if (nla_parse_nested(tb, NL80211_MESH_SETUP_ATTR_MAX, - info->attrs[NL80211_ATTR_MESH_SETUP], - nl80211_mesh_setup_params_policy, info->extack)) + if (nla_parse_nested_deprecated(tb, NL80211_MESH_SETUP_ATTR_MAX, info->attrs[NL80211_ATTR_MESH_SETUP], nl80211_mesh_setup_params_policy, info->extack)) return -EINVAL; if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC]) @@ -6629,7 +6722,7 @@ static int nl80211_put_regdom(const struct ieee80211_regdomain *regdom, nla_put_u8(msg, NL80211_ATTR_DFS_REGION, regdom->dfs_region))) goto nla_put_failure; - nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES); + nl_reg_rules = nla_nest_start_noflag(msg, NL80211_ATTR_REG_RULES); if (!nl_reg_rules) goto nla_put_failure; @@ -6644,7 +6737,7 @@ static int nl80211_put_regdom(const struct ieee80211_regdomain *regdom, freq_range = &reg_rule->freq_range; power_rule = &reg_rule->power_rule; - nl_reg_rule = nla_nest_start(msg, i); + nl_reg_rule = nla_nest_start_noflag(msg, i); if (!nl_reg_rule) goto nla_put_failure; @@ -6882,7 +6975,7 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) struct nlattr *nl_reg_rule; char *alpha2; int rem_reg_rules, r; - u32 num_rules = 0, rule_idx = 0, size_of_regd; + u32 num_rules = 0, rule_idx = 0; enum nl80211_dfs_regions dfs_region = NL80211_DFS_UNSET; struct ieee80211_regdomain *rd; @@ -6907,10 +7000,7 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) if (!reg_is_valid_request(alpha2)) return -EINVAL; - size_of_regd = sizeof(struct ieee80211_regdomain) + - num_rules * sizeof(struct ieee80211_reg_rule); - - rd = kzalloc(size_of_regd, GFP_KERNEL); + rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); if (!rd) return -ENOMEM; @@ -6927,9 +7017,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], rem_reg_rules) { - r = nla_parse_nested(tb, NL80211_REG_RULE_ATTR_MAX, - nl_reg_rule, reg_rule_policy, - info->extack); + r = nla_parse_nested_deprecated(tb, NL80211_REG_RULE_ATTR_MAX, + nl_reg_rule, reg_rule_policy, + info->extack); if (r) goto bad_reg; r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); @@ -7000,8 +7090,9 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy, if (!nla_ok(nest, nla_len(nest))) return -EINVAL; - err = nla_parse_nested(attr, NL80211_BSS_SELECT_ATTR_MAX, nest, - nl80211_bss_select_policy, NULL); + err = nla_parse_nested_deprecated(attr, 
NL80211_BSS_SELECT_ATTR_MAX, + nest, nl80211_bss_select_policy, + NULL); if (err) return err; @@ -7494,8 +7585,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, if (WARN_ON(i >= n_plans)) return -EINVAL; - err = nla_parse_nested(plan, NL80211_SCHED_SCAN_PLAN_MAX, - attr, nl80211_plan_policy, NULL); + err = nla_parse_nested_deprecated(plan, + NL80211_SCHED_SCAN_PLAN_MAX, + attr, nl80211_plan_policy, + NULL); if (err) return err; @@ -7537,6 +7630,41 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, return 0; } +static int +nl80211_parse_sched_scan_per_band_rssi(struct wiphy *wiphy, + struct cfg80211_match_set *match_sets, + struct nlattr *tb_band_rssi, + s32 rssi_thold) +{ + struct nlattr *attr; + int i, tmp, ret = 0; + + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD)) { + if (tb_band_rssi) + ret = -EOPNOTSUPP; + else + for (i = 0; i < NUM_NL80211_BANDS; i++) + match_sets->per_band_rssi_thold[i] = + NL80211_SCAN_RSSI_THOLD_OFF; + return ret; + } + + for (i = 0; i < NUM_NL80211_BANDS; i++) + match_sets->per_band_rssi_thold[i] = rssi_thold; + + nla_for_each_nested(attr, tb_band_rssi, tmp) { + enum nl80211_band band = nla_type(attr); + + if (band < 0 || band >= NUM_NL80211_BANDS) + return -EINVAL; + + match_sets->per_band_rssi_thold[band] = nla_get_s32(attr); + } + + return 0; +} + static struct cfg80211_sched_scan_request * nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct nlattr **attrs, int max_match_sets) @@ -7581,10 +7709,11 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, tmp) { struct nlattr *rssi; - err = nla_parse_nested(tb, - NL80211_SCHED_SCAN_MATCH_ATTR_MAX, - attr, nl80211_match_policy, - NULL); + err = nla_parse_nested_deprecated(tb, + NL80211_SCHED_SCAN_MATCH_ATTR_MAX, + attr, + nl80211_match_policy, + NULL); if (err) return ERR_PTR(err); @@ -7768,51 +7897,64 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, tmp) { struct nlattr *ssid, *bssid, *rssi; - err = nla_parse_nested(tb, - NL80211_SCHED_SCAN_MATCH_ATTR_MAX, - attr, nl80211_match_policy, - NULL); + err = nla_parse_nested_deprecated(tb, + NL80211_SCHED_SCAN_MATCH_ATTR_MAX, + attr, + nl80211_match_policy, + NULL); if (err) goto out_free; ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]; bssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]; - if (ssid || bssid) { - if (WARN_ON(i >= n_match_sets)) { - /* this indicates a programming error, - * the loop above should have verified - * things properly - */ + + if (!ssid && !bssid) { + i++; + continue; + } + + if (WARN_ON(i >= n_match_sets)) { + /* this indicates a programming error, + * the loop above should have verified + * things properly + */ + err = -EINVAL; + goto out_free; + } + + if (ssid) { + if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { err = -EINVAL; goto out_free; } - - if (ssid) { - if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { - err = -EINVAL; - goto out_free; - } - memcpy(request->match_sets[i].ssid.ssid, - nla_data(ssid), nla_len(ssid)); - request->match_sets[i].ssid.ssid_len = - nla_len(ssid); - } - if (bssid) { - if (nla_len(bssid) != ETH_ALEN) { - err = -EINVAL; - goto out_free; - } - memcpy(request->match_sets[i].bssid, - nla_data(bssid), ETH_ALEN); + memcpy(request->match_sets[i].ssid.ssid, + nla_data(ssid), nla_len(ssid)); + request->match_sets[i].ssid.ssid_len = + nla_len(ssid); + } + if (bssid) { + if (nla_len(bssid) != ETH_ALEN) { + err = -EINVAL; + goto out_free; } + 
memcpy(request->match_sets[i].bssid, + nla_data(bssid), ETH_ALEN); + } - /* special attribute - old implementation w/a */ + /* special attribute - old implementation w/a */ + request->match_sets[i].rssi_thold = default_match_rssi; + rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; + if (rssi) request->match_sets[i].rssi_thold = - default_match_rssi; - rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI]; - if (rssi) - request->match_sets[i].rssi_thold = - nla_get_s32(rssi); - } + nla_get_s32(rssi); + + /* Parse per band RSSI attribute */ + err = nl80211_parse_sched_scan_per_band_rssi(wiphy, + &request->match_sets[i], + tb[NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI], + request->match_sets[i].rssi_thold); + if (err) + goto out_free; + i++; } @@ -8061,7 +8203,7 @@ static int nl80211_notify_radar_detection(struct sk_buff *skb, cfg80211_sched_dfs_chan_update(rdev); - memcpy(&rdev->radar_chandef, &chandef, sizeof(chandef)); + rdev->radar_chandef = chandef; /* Propagate this notification to other radios as well */ queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); @@ -8143,9 +8285,9 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX, - info->attrs[NL80211_ATTR_CSA_IES], - nl80211_policy, info->extack); + err = nla_parse_nested_deprecated(csa_attrs, NL80211_ATTR_MAX, + info->attrs[NL80211_ATTR_CSA_IES], + nl80211_policy, info->extack); if (err) return err; @@ -8269,7 +8411,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, NL80211_ATTR_PAD)) goto nla_put_failure; - bss = nla_nest_start(msg, NL80211_ATTR_BSS); + bss = nla_nest_start_noflag(msg, NL80211_ATTR_BSS); if (!bss) goto nla_put_failure; if ((!is_zero_ether_addr(res->bssid) && @@ -8446,7 +8588,7 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; - infoattr = nla_nest_start(msg, NL80211_ATTR_SURVEY_INFO); + infoattr = nla_nest_start_noflag(msg, NL80211_ATTR_SURVEY_INFO); if (!infoattr) goto nla_put_failure; @@ -9287,7 +9429,7 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, goto nla_put_failure; } - data = nla_nest_start(skb, attr); + data = nla_nest_start_noflag(skb, attr); if (!data) goto nla_put_failure; @@ -9420,9 +9562,10 @@ static int nl80211_testmode_dump(struct sk_buff *skb, } else { struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); - err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, - attrbuf, nl80211_fam.maxattr, - nl80211_policy, NULL); + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl80211_fam.hdrsize, + attrbuf, nl80211_fam.maxattr, + nl80211_policy, NULL); if (err) goto out_err; @@ -9461,7 +9604,7 @@ static int nl80211_testmode_dump(struct sk_buff *skb, break; } - tmdata = nla_nest_start(skb, NL80211_ATTR_TESTDATA); + tmdata = nla_nest_start_noflag(skb, NL80211_ATTR_TESTDATA); if (!tmdata) { genlmsg_cancel(skb, hdr); break; @@ -10546,8 +10689,9 @@ static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) if (!cqm) return -EINVAL; - err = nla_parse_nested(attrs, NL80211_ATTR_CQM_MAX, cqm, - nl80211_attr_cqm_policy, info->extack); + err = nla_parse_nested_deprecated(attrs, NL80211_ATTR_CQM_MAX, cqm, + nl80211_attr_cqm_policy, + info->extack); if (err) return err; @@ -10739,12 +10883,12 @@ static int nl80211_send_wowlan_patterns(struct sk_buff *msg, if (!wowlan->n_patterns) return 0; - nl_pats = nla_nest_start(msg, 
NL80211_WOWLAN_TRIG_PKT_PATTERN); + nl_pats = nla_nest_start_noflag(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN); if (!nl_pats) return -ENOBUFS; for (i = 0; i < wowlan->n_patterns; i++) { - nl_pat = nla_nest_start(msg, i + 1); + nl_pat = nla_nest_start_noflag(msg, i + 1); if (!nl_pat) return -ENOBUFS; pat_len = wowlan->patterns[i].pattern_len; @@ -10770,7 +10914,8 @@ static int nl80211_send_wowlan_tcp(struct sk_buff *msg, if (!tcp) return 0; - nl_tcp = nla_nest_start(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION); + nl_tcp = nla_nest_start_noflag(msg, + NL80211_WOWLAN_TRIG_TCP_CONNECTION); if (!nl_tcp) return -ENOBUFS; @@ -10814,7 +10959,7 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (!req) return 0; - nd = nla_nest_start(msg, NL80211_WOWLAN_TRIG_NET_DETECT); + nd = nla_nest_start_noflag(msg, NL80211_WOWLAN_TRIG_NET_DETECT); if (!nd) return -ENOBUFS; @@ -10840,7 +10985,7 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, return -ENOBUFS; } - freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); + freqs = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!freqs) return -ENOBUFS; @@ -10852,12 +10997,13 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, nla_nest_end(msg, freqs); if (req->n_match_sets) { - matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH); + matches = nla_nest_start_noflag(msg, + NL80211_ATTR_SCHED_SCAN_MATCH); if (!matches) return -ENOBUFS; for (i = 0; i < req->n_match_sets; i++) { - match = nla_nest_start(msg, i); + match = nla_nest_start_noflag(msg, i); if (!match) return -ENOBUFS; @@ -10870,12 +11016,12 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, nla_nest_end(msg, matches); } - scan_plans = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_PLANS); + scan_plans = nla_nest_start_noflag(msg, NL80211_ATTR_SCHED_SCAN_PLANS); if (!scan_plans) return -ENOBUFS; for (i = 0; i < req->n_scan_plans; i++) { - scan_plan = nla_nest_start(msg, i + 1); + scan_plan = nla_nest_start_noflag(msg, i + 1); if (!scan_plan) return -ENOBUFS; @@ -10924,7 +11070,8 @@ static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info) if (rdev->wiphy.wowlan_config) { struct nlattr *nl_wowlan; - nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS); + nl_wowlan = nla_nest_start_noflag(msg, + NL80211_ATTR_WOWLAN_TRIGGERS); if (!nl_wowlan) goto nla_put_failure; @@ -10982,8 +11129,8 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, if (!rdev->wiphy.wowlan->tcp) return -EINVAL; - err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TCP, attr, - nl80211_wowlan_tcp_policy, NULL); + err = nla_parse_nested_deprecated(tb, MAX_NL80211_WOWLAN_TCP, attr, + nl80211_wowlan_tcp_policy, NULL); if (err) return err; @@ -11128,8 +11275,8 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, goto out; } - err = nla_parse_nested(tb, NL80211_ATTR_MAX, attr, nl80211_policy, - NULL); + err = nla_parse_nested_deprecated(tb, NL80211_ATTR_MAX, attr, + nl80211_policy, NULL); if (err) goto out; @@ -11164,9 +11311,9 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) goto set_wakeup; } - err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TRIG, - info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS], - nl80211_wowlan_policy, info->extack); + err = nla_parse_nested_deprecated(tb, MAX_NL80211_WOWLAN_TRIG, + info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS], + nl80211_wowlan_policy, info->extack); if (err) return err; @@ -11248,9 +11395,11 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) 
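/*
 * Aside on the nla_nest_start() -> nla_nest_start_noflag() churn above:
 * the rename leaves the wire format of these messages unchanged. As of
 * this series, the helpers in include/net/netlink.h read roughly as
 * below (paraphrased): the old behaviour survives under the _noflag
 * name, while nla_nest_start() now ORs in NLA_F_NESTED so strictly
 * validated families emit properly flagged nests.
 */
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
						   int attrtype)
{
	struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb);

	/* Reserve a zero-length attribute header; it is sized later by
	 * nla_nest_end() once the nested attributes have been added.
	 */
	if (nla_put(skb, attrtype, 0, NULL) < 0)
		return NULL;

	return start;
}

static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
	return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED);
}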
rem) { u8 *mask_pat; - err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - nl80211_packet_pattern_policy, - info->extack); + err = nla_parse_nested_deprecated(pat_tb, + MAX_NL80211_PKTPAT, + pat, + nl80211_packet_pattern_policy, + info->extack); if (err) goto error; @@ -11358,12 +11507,12 @@ static int nl80211_send_coalesce_rules(struct sk_buff *msg, if (!rdev->coalesce->n_rules) return 0; - nl_rules = nla_nest_start(msg, NL80211_ATTR_COALESCE_RULE); + nl_rules = nla_nest_start_noflag(msg, NL80211_ATTR_COALESCE_RULE); if (!nl_rules) return -ENOBUFS; for (i = 0; i < rdev->coalesce->n_rules; i++) { - nl_rule = nla_nest_start(msg, i + 1); + nl_rule = nla_nest_start_noflag(msg, i + 1); if (!nl_rule) return -ENOBUFS; @@ -11376,13 +11525,13 @@ static int nl80211_send_coalesce_rules(struct sk_buff *msg, rule->condition)) return -ENOBUFS; - nl_pats = nla_nest_start(msg, - NL80211_ATTR_COALESCE_RULE_PKT_PATTERN); + nl_pats = nla_nest_start_noflag(msg, + NL80211_ATTR_COALESCE_RULE_PKT_PATTERN); if (!nl_pats) return -ENOBUFS; for (j = 0; j < rule->n_patterns; j++) { - nl_pat = nla_nest_start(msg, j + 1); + nl_pat = nla_nest_start_noflag(msg, j + 1); if (!nl_pat) return -ENOBUFS; pat_len = rule->patterns[j].pattern_len; @@ -11463,8 +11612,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, int rem, pat_len, mask_len, pkt_offset, n_patterns = 0; struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; - err = nla_parse_nested(tb, NL80211_ATTR_COALESCE_RULE_MAX, rule, - nl80211_coalesce_policy, NULL); + err = nla_parse_nested_deprecated(tb, NL80211_ATTR_COALESCE_RULE_MAX, + rule, nl80211_coalesce_policy, NULL); if (err) return err; @@ -11499,8 +11648,10 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - nl80211_packet_pattern_policy, NULL); + err = nla_parse_nested_deprecated(pat_tb, MAX_NL80211_PKTPAT, + pat, + nl80211_packet_pattern_policy, + NULL); if (err) return err; @@ -11622,9 +11773,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_REKEY_DATA]) return -EINVAL; - err = nla_parse_nested(tb, MAX_NL80211_REKEY_DATA, - info->attrs[NL80211_ATTR_REKEY_DATA], - nl80211_rekey_policy, info->extack); + err = nla_parse_nested_deprecated(tb, MAX_NL80211_REKEY_DATA, + info->attrs[NL80211_ATTR_REKEY_DATA], + nl80211_rekey_policy, info->extack); if (err) return err; @@ -11936,9 +12087,10 @@ static int nl80211_nan_add_func(struct sk_buff *skb, if (!info->attrs[NL80211_ATTR_NAN_FUNC]) return -EINVAL; - err = nla_parse_nested(tb, NL80211_NAN_FUNC_ATTR_MAX, - info->attrs[NL80211_ATTR_NAN_FUNC], - nl80211_nan_func_policy, info->extack); + err = nla_parse_nested_deprecated(tb, NL80211_NAN_FUNC_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_FUNC], + nl80211_nan_func_policy, + info->extack); if (err) return err; @@ -12034,9 +12186,11 @@ static int nl80211_nan_add_func(struct sk_buff *skb, if (tb[NL80211_NAN_FUNC_SRF]) { struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR]; - err = nla_parse_nested(srf_tb, NL80211_NAN_SRF_ATTR_MAX, - tb[NL80211_NAN_FUNC_SRF], - nl80211_nan_srf_policy, info->extack); + err = nla_parse_nested_deprecated(srf_tb, + NL80211_NAN_SRF_ATTR_MAX, + tb[NL80211_NAN_FUNC_SRF], + nl80211_nan_srf_policy, + info->extack); if (err) goto out; @@ -12134,7 +12288,7 @@ out: NL80211_ATTR_PAD)) goto nla_put_failure; - func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC); + func_attr = nla_nest_start_noflag(msg, 
NL80211_ATTR_NAN_FUNC); if (!func_attr) goto nla_put_failure; @@ -12251,11 +12405,12 @@ void cfg80211_nan_match(struct wireless_dev *wdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr)) goto nla_put_failure; - match_attr = nla_nest_start(msg, NL80211_ATTR_NAN_MATCH); + match_attr = nla_nest_start_noflag(msg, NL80211_ATTR_NAN_MATCH); if (!match_attr) goto nla_put_failure; - local_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_LOCAL); + local_func_attr = nla_nest_start_noflag(msg, + NL80211_NAN_MATCH_FUNC_LOCAL); if (!local_func_attr) goto nla_put_failure; @@ -12264,7 +12419,8 @@ void cfg80211_nan_match(struct wireless_dev *wdev, nla_nest_end(msg, local_func_attr); - peer_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_PEER); + peer_func_attr = nla_nest_start_noflag(msg, + NL80211_NAN_MATCH_FUNC_PEER); if (!peer_func_attr) goto nla_put_failure; @@ -12330,7 +12486,7 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev, NL80211_ATTR_PAD)) goto nla_put_failure; - func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC); + func_attr = nla_nest_start_noflag(msg, NL80211_ATTR_NAN_FUNC); if (!func_attr) goto nla_put_failure; @@ -12567,8 +12723,10 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, return 0; } - err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, attrbuf, - nl80211_fam.maxattr, nl80211_policy, NULL); + err = nlmsg_parse_deprecated(cb->nlh, + GENL_HDRLEN + nl80211_fam.hdrsize, + attrbuf, nl80211_fam.maxattr, + nl80211_policy, NULL); if (err) return err; @@ -12679,7 +12837,8 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, break; } - vendor_data = nla_nest_start(skb, NL80211_ATTR_VENDOR_DATA); + vendor_data = nla_nest_start_noflag(skb, + NL80211_ATTR_VENDOR_DATA); if (!vendor_data) { genlmsg_cancel(skb, hdr); break; @@ -13223,7 +13382,8 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; - ftm_stats_attr = nla_nest_start(msg, NL80211_ATTR_FTM_RESPONDER_STATS); + ftm_stats_attr = nla_nest_start_noflag(msg, + NL80211_ATTR_FTM_RESPONDER_STATS); if (!ftm_stats_attr) goto nla_put_failure; @@ -13259,6 +13419,72 @@ nla_put_failure: return -ENOBUFS; } +static int nl80211_update_owe_info(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_update_owe_info owe_info; + struct net_device *dev = info->user_ptr[1]; + + if (!rdev->ops->update_owe_info) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_STATUS_CODE] || + !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + memset(&owe_info, 0, sizeof(owe_info)); + owe_info.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); + nla_memcpy(owe_info.peer, info->attrs[NL80211_ATTR_MAC], ETH_ALEN); + + if (info->attrs[NL80211_ATTR_IE]) { + owe_info.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + owe_info.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + } + + return rdev_update_owe_info(rdev, dev, &owe_info); +} + +static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct station_info sinfo = {}; + const u8 *buf; + size_t len; + u8 *dest; + int err; + + if (!rdev->ops->probe_mesh_link || !rdev->ops->get_station) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_FRAME]) { + 
GENL_SET_ERR_MSG(info, "Frame or MAC missing"); + return -EINVAL; + } + + if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + dest = nla_data(info->attrs[NL80211_ATTR_MAC]); + buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); + len = nla_len(info->attrs[NL80211_ATTR_FRAME]); + + if (len < sizeof(struct ethhdr)) + return -EINVAL; + + if (!ether_addr_equal(buf, dest) || is_multicast_ether_addr(buf) || + !ether_addr_equal(buf + ETH_ALEN, dev->dev_addr)) + return -EINVAL; + + err = rdev_get_station(rdev, dev, dest, &sinfo); + if (err) + return err; + + return rdev_probe_mesh_link(rdev, dev, dest, buf, len); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13365,66 +13591,66 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_wiphy, .dumpit = nl80211_dump_wiphy, .done = nl80211_dump_wiphy_done, - .policy = nl80211_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_WIPHY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wiphy, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_interface, .dumpit = nl80211_dump_interface, - .policy = nl80211_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_interface, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_interface, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_INTERFACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_interface, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_key, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_key, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | @@ -13432,8 +13658,8 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_NEW_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_key, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | @@ -13441,15 +13667,15 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_DEL_KEY, + .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_key, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_BEACON, - .policy = nl80211_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_set_beacon, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | @@ -13457,7 +13683,7 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_START_AP, - .policy = nl80211_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_start_ap, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | @@ -13465,7 +13691,7 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_STOP_AP, - .policy = nl80211_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, .doit = nl80211_stop_ap, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | @@ -13473,172 +13699,172 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_GET_STATION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_station, .dumpit = nl80211_dump_station, - .policy = nl80211_policy, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_STATION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_station, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_STATION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_station, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_STATION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_station, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_MPATH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_mpath, .dumpit = nl80211_dump_mpath, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_MPP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_mpp, .dumpit = nl80211_dump_mpp, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MPATH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mpath, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_MPATH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_new_mpath, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_MPATH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_mpath, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_BSS, + .validate = 
GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_bss, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_REG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_reg_do, .dumpit = nl80211_get_reg_dump, - .policy = nl80211_policy, .internal_flags = NL80211_FLAG_NEED_RTNL, /* can be retrieved by unprivileged users */ }, #ifdef CONFIG_CFG80211_CRDA_SUPPORT { .cmd = NL80211_CMD_SET_REG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_reg, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_RTNL, }, #endif { .cmd = NL80211_CMD_REQ_SET_REG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_req_set_reg, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = NL80211_CMD_RELOAD_REGDB, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_reload_regdb, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, { .cmd = NL80211_CMD_GET_MESH_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_mesh_config, - .policy = nl80211_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MESH_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_mesh_config, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_TRIGGER_SCAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_trigger_scan, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_ABORT_SCAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_abort_scan, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_SCAN, - .policy = nl80211_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nl80211_dump_scan, }, { .cmd = NL80211_CMD_START_SCHED_SCAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_sched_scan, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_STOP_SCHED_SCAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_sched_scan, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_AUTHENTICATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_authenticate, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | @@ -13646,40 +13872,41 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_ASSOCIATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_associate, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + 
NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEAUTHENTICATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_deauthenticate, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DISASSOCIATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disassociate, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_JOIN_IBSS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ibss, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_LEAVE_IBSS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ibss, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -13687,9 +13914,9 @@ static const struct genl_ops nl80211_ops[] = { #ifdef CONFIG_NL80211_TESTMODE { .cmd = NL80211_CMD_TESTMODE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_testmode_do, .dumpit = nl80211_testmode_dump, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, @@ -13697,181 +13924,184 @@ static const struct genl_ops nl80211_ops[] = { #endif { .cmd = NL80211_CMD_CONNECT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_connect, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_connect_params, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DISCONNECT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_disconnect, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_WIPHY_NETNS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_wiphy_netns, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_SURVEY, - .policy = nl80211_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nl80211_dump_survey, }, { .cmd = NL80211_CMD_SET_PMKSA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_setdel_pmksa, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_PMKSA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_setdel_pmksa, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_FLUSH_PMKSA, + .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, .doit = nl80211_flush_pmksa, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_remain_on_channel, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_cancel_remain_on_channel, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_tx_bitrate_mask, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_REGISTER_FRAME, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_mgmt, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_FRAME, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_FRAME_WAIT_CANCEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_mgmt_cancel_wait, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_POWER_SAVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_power_save, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_POWER_SAVE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_power_save, - .policy = nl80211_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_CQM, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_cqm, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_CHANNEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_channel, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_WDS_PEER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wds_peer, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_JOIN_MESH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_mesh, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_LEAVE_MESH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_mesh, - .policy = nl80211_policy, .flags 
= GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_JOIN_OCB, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_join_ocb, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_LEAVE_OCB, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_leave_ocb, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -13879,16 +14109,16 @@ static const struct genl_ops nl80211_ops[] = { #ifdef CONFIG_PM { .cmd = NL80211_CMD_GET_WOWLAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_wowlan, - .policy = nl80211_policy, /* can be retrieved by unprivileged users */ .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_WOWLAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_wowlan, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, @@ -13896,8 +14126,8 @@ static const struct genl_ops nl80211_ops[] = { #endif { .cmd = NL80211_CMD_SET_REKEY_OFFLOAD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_rekey_data, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL | @@ -13905,290 +14135,306 @@ static const struct genl_ops nl80211_ops[] = { }, { .cmd = NL80211_CMD_TDLS_MGMT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_mgmt, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_TDLS_OPER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_oper, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_UNEXPECTED_FRAME, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_unexpected_frame, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_PROBE_CLIENT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_probe_client, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_REGISTER_BEACONS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_register_beacons, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_NOACK_MAP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_noack_map, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_START_P2P_DEVICE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_p2p_device, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_STOP_P2P_DEVICE, 
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_p2p_device, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_START_NAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_nan, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_STOP_NAN, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_stop_nan, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_ADD_NAN_FUNCTION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_add_func, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_NAN_FUNCTION, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_del_func, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CHANGE_NAN_CONFIG, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_nan_change_config, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MCAST_RATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mcast_rate, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MAC_ACL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_mac_acl, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_RADAR_DETECT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_start_radar_detection, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_protocol_features, - .policy = nl80211_policy, }, { .cmd = NL80211_CMD_UPDATE_FT_IES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_update_ft_ies, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_START, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_start, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_crit_protocol_stop, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_COALESCE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_coalesce, - .policy = nl80211_policy, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, 
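/* Illustration, not part of the patch: the recurring change in these
 * nl80211_ops entries is that the per-op ".policy = nl80211_policy"
 * pointers are dropped in favour of one family-wide policy (attached to
 * nl80211_fam further below), while commands not yet audited for strict
 * checking opt out via ".validate". A minimal sketch of the resulting
 * pattern, in which every "example_*" name is hypothetical:
 *
 *	static const struct genl_ops example_ops[] = {
 *		{
 *			.cmd = EXAMPLE_CMD_SET,
 *			.validate = GENL_DONT_VALIDATE_STRICT |
 *				    GENL_DONT_VALIDATE_DUMP,
 *			.doit = example_set_doit,
 *		},
 *	};
 *
 *	static struct genl_family example_fam __ro_after_init = {
 *		.name = "example",
 *		.maxattr = EXAMPLE_ATTR_MAX,
 *		.policy = example_policy,  (one policy for the whole family)
 *		.ops = example_ops,
 *		.n_ops = ARRAY_SIZE(example_ops),
 *	};
 */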
}, { .cmd = NL80211_CMD_SET_COALESCE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_coalesce, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CHANNEL_SWITCH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_channel_switch, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_VENDOR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_vendor_cmd, .dumpit = nl80211_vendor_cmd_dump, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_SET_QOS_MAP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_qos_map, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_ADD_TX_TS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_add_tx_ts, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_TX_TS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_tx_ts, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_channel_switch, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tdls_cancel_channel_switch, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_multicast_to_unicast, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_PMK, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, - .policy = nl80211_policy, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | - NL80211_FLAG_NEED_RTNL, + NL80211_FLAG_NEED_RTNL | + NL80211_FLAG_CLEAR_SKB, }, { .cmd = NL80211_CMD_DEL_PMK, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, - .policy = nl80211_policy, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_EXTERNAL_AUTH, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_external_auth, - .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CONTROL_PORT_FRAME, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_tx_control_port, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, 
}, { .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_get_ftm_responder_stats, - .policy = nl80211_policy, .internal_flags = NL80211_FLAG_NEED_NETDEV | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_PEER_MEASUREMENT_START, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_pmsr_start, - .policy = nl80211_policy, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NOTIFY_RADAR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_notify_radar_detection, - .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_UPDATE_OWE_INFO, + .doit = nl80211_update_owe_info, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_PROBE_MESH_LINK, + .doit = nl80211_probe_mesh_link, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, @@ -14200,6 +14446,7 @@ static struct genl_family nl80211_fam __ro_after_init = { .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ .maxattr = NL80211_ATTR_MAX, + .policy = nl80211_policy, .netnsok = true, .pre_doit = nl80211_pre_doit, .post_doit = nl80211_post_doit, @@ -14263,7 +14510,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, if (WARN_ON(!req)) return 0; - nest = nla_nest_start(msg, NL80211_ATTR_SCAN_SSIDS); + nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_SSIDS); if (!nest) goto nla_put_failure; for (i = 0; i < req->n_ssids; i++) { @@ -14272,7 +14519,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, } nla_nest_end(msg, nest); - nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); + nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!nest) goto nla_put_failure; for (i = 0; i < req->n_channels; i++) { @@ -14534,7 +14781,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, if (uapsd_queues >= 0) { struct nlattr *nla_wmm = - nla_nest_start(msg, NL80211_ATTR_STA_WME); + nla_nest_start_noflag(msg, NL80211_ATTR_STA_WME); if (!nla_wmm) goto nla_put_failure; @@ -14975,7 +15222,7 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, goto nla_put_failure; /* Before */ - nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE); + nl_freq = nla_nest_start_noflag(msg, NL80211_ATTR_FREQ_BEFORE); if (!nl_freq) goto nla_put_failure; @@ -14984,7 +15231,7 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, nla_nest_end(msg, nl_freq); /* After */ - nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER); + nl_freq = nla_nest_start_noflag(msg, NL80211_ATTR_FREQ_AFTER); if (!nl_freq) goto nla_put_failure; @@ -15418,7 +15665,7 @@ static struct sk_buff *cfg80211_prepare_cqm(struct net_device *dev, if (mac && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac)) goto nla_put_failure; - cb[1] = nla_nest_start(msg, NL80211_ATTR_CQM); + cb[1] = nla_nest_start_noflag(msg, NL80211_ATTR_CQM); if (!cb[1]) goto nla_put_failure; @@ -15579,7 +15826,7 @@ static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; - rekey_attr = nla_nest_start(msg, NL80211_ATTR_REKEY_DATA); + rekey_attr = nla_nest_start_noflag(msg, NL80211_ATTR_REKEY_DATA); if (!rekey_attr) goto 
nla_put_failure; @@ -15634,7 +15881,7 @@ nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) goto nla_put_failure; - attr = nla_nest_start(msg, NL80211_ATTR_PMKSA_CANDIDATE); + attr = nla_nest_start_noflag(msg, NL80211_ATTR_PMKSA_CANDIDATE); if (!attr) goto nla_put_failure; @@ -15721,6 +15968,11 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; + + if (wdev->iftype == NL80211_IFTYPE_STATION && + !WARN_ON(!wdev->current_bss)) + wdev->current_bss->pub.channel = chandef->chan; + nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, NL80211_CMD_CH_SWITCH_NOTIFY, 0); } @@ -15939,15 +16191,15 @@ static int cfg80211_net_detect_results(struct sk_buff *msg, struct nlattr *nl_results, *nl_match, *nl_freqs; int i, j; - nl_results = nla_nest_start( - msg, NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS); + nl_results = nla_nest_start_noflag(msg, + NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS); if (!nl_results) return -EMSGSIZE; for (i = 0; i < nd->n_matches; i++) { struct cfg80211_wowlan_nd_match *match = nd->matches[i]; - nl_match = nla_nest_start(msg, i); + nl_match = nla_nest_start_noflag(msg, i); if (!nl_match) break; @@ -15965,8 +16217,8 @@ static int cfg80211_net_detect_results(struct sk_buff *msg, } if (match->n_channels) { - nl_freqs = nla_nest_start( - msg, NL80211_ATTR_SCAN_FREQUENCIES); + nl_freqs = nla_nest_start_noflag(msg, + NL80211_ATTR_SCAN_FREQUENCIES); if (!nl_freqs) { nla_nest_cancel(msg, nl_match); goto out; @@ -16025,7 +16277,8 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, if (wakeup) { struct nlattr *reasons; - reasons = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS); + reasons = nla_nest_start_noflag(msg, + NL80211_ATTR_WOWLAN_TRIGGERS); if (!reasons) goto free_msg; @@ -16364,6 +16617,46 @@ int cfg80211_external_auth_request(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_external_auth_request); +void cfg80211_update_owe_info_event(struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info, + gfp_t gfp) +{ + struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_update_owe_info_event(wiphy, netdev, owe_info); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_UPDATE_OWE_INFO); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, owe_info->peer)) + goto nla_put_failure; + + if (!owe_info->ie_len || + nla_put(msg, NL80211_ATTR_IE, owe_info->ie_len, owe_info->ie)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, gfp); + return; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_update_owe_info_event); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 5e2ab01d325c..1b190475359a 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -25,7 +25,8 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, } /* no validation needed - was already done via nested policy */ - nla_parse_nested(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, NULL, NULL); + 
nla_parse_nested_deprecated(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq, + NULL, NULL); if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]); @@ -139,7 +140,8 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, int err, rem; /* no validation needed - was already done via nested policy */ - nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, NULL, NULL); + nla_parse_nested_deprecated(tb, NL80211_PMSR_PEER_ATTR_MAX, peer, + NULL, NULL); if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] || !tb[NL80211_PMSR_PEER_ATTR_CHAN] || @@ -154,9 +156,9 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); /* need to validate here, we don't want to have validation recursion */ - err = nla_parse_nested(info->attrs, NL80211_ATTR_MAX, - tb[NL80211_PMSR_PEER_ATTR_CHAN], - nl80211_policy, info->extack); + err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, + tb[NL80211_PMSR_PEER_ATTR_CHAN], + nl80211_policy, info->extack); if (err) return err; @@ -165,9 +167,9 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, return err; /* no validation needed - was already done via nested policy */ - nla_parse_nested(req, NL80211_PMSR_REQ_ATTR_MAX, - tb[NL80211_PMSR_PEER_ATTR_REQ], - NULL, NULL); + nla_parse_nested_deprecated(req, NL80211_PMSR_REQ_ATTR_MAX, + tb[NL80211_PMSR_PEER_ATTR_REQ], NULL, + NULL); if (!req[NL80211_PMSR_REQ_ATTR_DATA]) { NL_SET_ERR_MSG_ATTR(info->extack, @@ -420,22 +422,22 @@ static int nl80211_pmsr_send_result(struct sk_buff *msg, { struct nlattr *pmsr, *peers, *peer, *resp, *data, *typedata; - pmsr = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS); + pmsr = nla_nest_start_noflag(msg, NL80211_ATTR_PEER_MEASUREMENTS); if (!pmsr) goto error; - peers = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS); + peers = nla_nest_start_noflag(msg, NL80211_PMSR_ATTR_PEERS); if (!peers) goto error; - peer = nla_nest_start(msg, 1); + peer = nla_nest_start_noflag(msg, 1); if (!peer) goto error; if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr)) goto error; - resp = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_RESP); + resp = nla_nest_start_noflag(msg, NL80211_PMSR_PEER_ATTR_RESP); if (!resp) goto error; @@ -452,11 +454,11 @@ static int nl80211_pmsr_send_result(struct sk_buff *msg, if (res->final && nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL)) goto error; - data = nla_nest_start(msg, NL80211_PMSR_RESP_ATTR_DATA); + data = nla_nest_start_noflag(msg, NL80211_PMSR_RESP_ATTR_DATA); if (!data) goto error; - typedata = nla_nest_start(msg, res->type); + typedata = nla_nest_start_noflag(msg, res->type); if (!typedata) goto error; diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 5cb48d135fab..e853a4fe6f97 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -77,7 +77,8 @@ static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct key_params *params) { int ret; - trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); + trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, + mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); @@ -1272,4 +1273,30 @@ rdev_abort_pmsr(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, + 
struct net_device *dev, + struct cfg80211_update_owe_info *oweinfo) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); + if (rdev->ops->update_owe_info) + ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int +rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *dest, + const void *buf, size_t len) +{ + int ret; + + trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); + ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2f1bf91eb226..4831ad745f91 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -427,14 +427,10 @@ static const struct ieee80211_regdomain * reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; - int size_of_regd; unsigned int i; - size_of_regd = - sizeof(struct ieee80211_regdomain) + - src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule); - - regd = kzalloc(size_of_regd, GFP_KERNEL); + regd = kzalloc(struct_size(regd, reg_rules, src_regd->n_reg_rules), + GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); @@ -948,12 +944,10 @@ static int regdb_query_country(const struct fwdb_header *db, unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); struct ieee80211_regdomain *regdom; - unsigned int size_of_regd, i; - - size_of_regd = sizeof(struct ieee80211_regdomain) + - coll->n_rules * sizeof(struct ieee80211_reg_rule); + unsigned int i; - regdom = kzalloc(size_of_regd, GFP_KERNEL); + regdom = kzalloc(struct_size(regdom, reg_rules, coll->n_rules), + GFP_KERNEL); if (!regdom) return -ENOMEM; @@ -1309,6 +1303,16 @@ reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1, return dfs_region1; } +static void reg_wmm_rules_intersect(const struct ieee80211_wmm_ac *wmm_ac1, + const struct ieee80211_wmm_ac *wmm_ac2, + struct ieee80211_wmm_ac *intersect) +{ + intersect->cw_min = max_t(u16, wmm_ac1->cw_min, wmm_ac2->cw_min); + intersect->cw_max = max_t(u16, wmm_ac1->cw_max, wmm_ac2->cw_max); + intersect->cot = min_t(u16, wmm_ac1->cot, wmm_ac2->cot); + intersect->aifsn = max_t(u8, wmm_ac1->aifsn, wmm_ac2->aifsn); +} + /* * Helper for regdom_intersect(), this does the real * mathematical intersection fun @@ -1323,6 +1327,8 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, struct ieee80211_freq_range *freq_range; const struct ieee80211_power_rule *power_rule1, *power_rule2; struct ieee80211_power_rule *power_rule; + const struct ieee80211_wmm_rule *wmm_rule1, *wmm_rule2; + struct ieee80211_wmm_rule *wmm_rule; u32 freq_diff, max_bandwidth1, max_bandwidth2; freq_range1 = &rule1->freq_range; @@ -1333,6 +1339,10 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, power_rule2 = &rule2->power_rule; power_rule = &intersected_rule->power_rule; + wmm_rule1 = &rule1->wmm_rule; + wmm_rule2 = &rule2->wmm_rule; + wmm_rule = &intersected_rule->wmm_rule; + freq_range->start_freq_khz = max(freq_range1->start_freq_khz, freq_range2->start_freq_khz); freq_range->end_freq_khz = min(freq_range1->end_freq_khz, @@ -1376,6 +1386,29 @@ static int reg_rules_intersect(const struct ieee80211_regdomain *rd1, intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms, rule2->dfs_cac_ms); + if (rule1->has_wmm && 
rule2->has_wmm) { + u8 ac; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + reg_wmm_rules_intersect(&wmm_rule1->client[ac], + &wmm_rule2->client[ac], + &wmm_rule->client[ac]); + reg_wmm_rules_intersect(&wmm_rule1->ap[ac], + &wmm_rule2->ap[ac], + &wmm_rule->ap[ac]); + } + + intersected_rule->has_wmm = true; + } else if (rule1->has_wmm) { + *wmm_rule = *wmm_rule1; + intersected_rule->has_wmm = true; + } else if (rule2->has_wmm) { + *wmm_rule = *wmm_rule2; + intersected_rule->has_wmm = true; + } else { + intersected_rule->has_wmm = false; + } + if (!is_valid_reg_rule(intersected_rule)) return -EINVAL; @@ -1450,7 +1483,7 @@ static struct ieee80211_regdomain * regdom_intersect(const struct ieee80211_regdomain *rd1, const struct ieee80211_regdomain *rd2) { - int r, size_of_regd; + int r; unsigned int x, y; unsigned int num_rules = 0; const struct ieee80211_reg_rule *rule1, *rule2; @@ -1481,10 +1514,7 @@ regdom_intersect(const struct ieee80211_regdomain *rd1, if (!num_rules) return NULL; - size_of_regd = sizeof(struct ieee80211_regdomain) + - num_rules * sizeof(struct ieee80211_reg_rule); - - rd = kzalloc(size_of_regd, GFP_KERNEL); + rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL); if (!rd) return NULL; @@ -3739,10 +3769,9 @@ void wiphy_regulatory_register(struct wiphy *wiphy) /* * The last request may have been received before this * registration call. Call the driver notifier if - * initiator is USER and user type is CELL_BASE. + * initiator is USER. */ - if (lr->initiator == NL80211_REGDOM_SET_BY_USER && - lr->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE) + if (lr->initiator == NL80211_REGDOM_SET_BY_USER) reg_call_notifier(wiphy, lr); } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 287518c6caa4..c04f5451f89b 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -179,21 +179,71 @@ static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev, return true; } +bool cfg80211_is_element_inherited(const struct element *elem, + const struct element *non_inherit_elem) +{ + u8 id_len, ext_id_len, i, loop_len, id; + const u8 *list; + + if (elem->id == WLAN_EID_MULTIPLE_BSSID) + return false; + + if (!non_inherit_elem || non_inherit_elem->datalen < 2) + return true; + + /* + * non inheritance element format is: + * ext ID (56) | IDs list len | list | extension IDs list len | list + * Both lists are optional. Both lengths are mandatory. + * This means valid length is: + * elem_len = 1 (extension ID) + 2 (list len fields) + list lengths + */ + id_len = non_inherit_elem->data[1]; + if (non_inherit_elem->datalen < 3 + id_len) + return true; + + ext_id_len = non_inherit_elem->data[2 + id_len]; + if (non_inherit_elem->datalen < 3 + id_len + ext_id_len) + return true; + + if (elem->id == WLAN_EID_EXTENSION) { + if (!ext_id_len) + return true; + loop_len = ext_id_len; + list = &non_inherit_elem->data[3 + id_len]; + id = elem->data[0]; + } else { + if (!id_len) + return true; + loop_len = id_len; + list = &non_inherit_elem->data[2]; + id = elem->id; + } + + for (i = 0; i < loop_len; i++) { + if (list[i] == id) + return false; + } + + return true; +} +EXPORT_SYMBOL(cfg80211_is_element_inherited); + static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, const u8 *subelement, size_t subie_len, u8 *new_ie, gfp_t gfp) { u8 *pos, *tmp; const u8 *tmp_old, *tmp_new; + const struct element *non_inherit_elem; u8 *sub_copy; /* copy subelement as we need to change its content to * mark an ie after it is processed. 
*/ - sub_copy = kmalloc(subie_len, gfp); + sub_copy = kmemdup(subelement, subie_len, gfp); if (!sub_copy) return 0; - memcpy(sub_copy, subelement, subie_len); pos = &new_ie[0]; @@ -204,6 +254,11 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, pos += (tmp_new[1] + 2); } + /* get the non-inheritance list, if it exists */ + non_inherit_elem = + cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub_copy, subie_len); + /* go through IEs in ie (skip SSID) and subelement, * merge them into new_ie */ @@ -224,8 +279,11 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, subie_len); if (!tmp) { + const struct element *old_elem = (void *)tmp_old; + /* ie in old ie but not in subelement */ - if (tmp_old[0] != WLAN_EID_MULTIPLE_BSSID) { + if (cfg80211_is_element_inherited(old_elem, + non_inherit_elem)) { memcpy(pos, tmp_old, tmp_old[1] + 2); pos += tmp_old[1] + 2; } @@ -269,8 +327,7 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, tmp_new = sub_copy; while (tmp_new + tmp_new[1] + 2 - sub_copy <= subie_len) { if (!(tmp_new[0] == WLAN_EID_NON_TX_BSSID_CAP || - tmp_new[0] == WLAN_EID_SSID || - tmp_new[0] == WLAN_EID_MULTI_BSSID_IDX)) { + tmp_new[0] == WLAN_EID_SSID)) { memcpy(pos, tmp_new, tmp_new[1] + 2); pos += tmp_new[1] + 2; } @@ -1398,6 +1455,78 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, return &res->pub; } +static const struct element +*cfg80211_get_profile_continuation(const u8 *ie, size_t ielen, + const struct element *mbssid_elem, + const struct element *sub_elem) +{ + const u8 *mbssid_end = mbssid_elem->data + mbssid_elem->datalen; + const struct element *next_mbssid; + const struct element *next_sub; + + next_mbssid = cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, + mbssid_end, + ielen - (mbssid_end - ie)); + + /* + * If this is not the last subelement in the current MBSSID IE or there + * isn't a next MBSSID IE - the profile is complete. + */ + if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) || + !next_mbssid) + return NULL; + + /* For any length error, just return NULL */ + + if (next_mbssid->datalen < 4) + return NULL; + + next_sub = (void *)&next_mbssid->data[1]; + + if (next_mbssid->data + next_mbssid->datalen < + next_sub->data + next_sub->datalen) + return NULL; + + if (next_sub->id != 0 || next_sub->datalen < 2) + return NULL; + + /* + * Check if the first element in the next sub element is a start + * of a new profile + */ + return next_sub->data[0] == WLAN_EID_NON_TX_BSSID_CAP ?
+ NULL : next_mbssid; +} + +size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, + const struct element *mbssid_elem, + const struct element *sub_elem, + u8 *merged_ie, size_t max_copy_len) +{ + size_t copied_len = sub_elem->datalen; + const struct element *next_mbssid; + + if (sub_elem->datalen > max_copy_len) + return 0; + + memcpy(merged_ie, sub_elem->data, sub_elem->datalen); + + while ((next_mbssid = cfg80211_get_profile_continuation(ie, ielen, + mbssid_elem, + sub_elem))) { + const struct element *next_sub = (void *)&next_mbssid->data[1]; + + if (copied_len + next_sub->datalen > max_copy_len) + break; + memcpy(merged_ie + copied_len, next_sub->data, + next_sub->datalen); + copied_len += next_sub->datalen; + } + + return copied_len; +} +EXPORT_SYMBOL(cfg80211_merge_profile); + static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, enum cfg80211_bss_frame_type ftype, @@ -1411,7 +1540,8 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, const struct element *elem, *sub; size_t new_ie_len; u8 new_bssid[ETH_ALEN]; - u8 *new_ie; + u8 *new_ie, *profile; + u64 seen_indices = 0; u16 capability; struct cfg80211_bss *bss; @@ -1429,10 +1559,16 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, if (!new_ie) return; + profile = kmalloc(ielen, gfp); + if (!profile) + goto out; + for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, ie, ielen) { if (elem->datalen < 4) continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { + u8 profile_len; + if (sub->id != 0 || sub->datalen < 4) { /* not a valid BSS profile */ continue; @@ -1447,16 +1583,31 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, continue; } + memset(profile, 0, ielen); + profile_len = cfg80211_merge_profile(ie, ielen, + elem, + sub, + profile, + ielen); + /* found a Nontransmitted BSSID Profile */ mbssid_index_ie = cfg80211_find_ie (WLAN_EID_MULTI_BSSID_IDX, - sub->data, sub->datalen); + profile, profile_len); if (!mbssid_index_ie || mbssid_index_ie[1] < 1 || - mbssid_index_ie[2] == 0) { + mbssid_index_ie[2] == 0 || + mbssid_index_ie[2] > 46) { /* No valid Multiple BSSID-Index element */ continue; } + if (seen_indices & BIT(mbssid_index_ie[2])) + /* We don't support legacy split of a profile */ + net_dbg_ratelimited("Partial info for BSSID index %d\n", + mbssid_index_ie[2]); + + seen_indices |= BIT(mbssid_index_ie[2]); + non_tx_data->bssid_index = mbssid_index_ie[2]; non_tx_data->max_bssid_indicator = elem->data[0]; @@ -1465,13 +1616,14 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, non_tx_data->bssid_index, new_bssid); memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); - new_ie_len = cfg80211_gen_new_ie(ie, ielen, sub->data, - sub->datalen, new_ie, + new_ie_len = cfg80211_gen_new_ie(ie, ielen, + profile, + profile_len, new_ie, gfp); if (!new_ie_len) continue; - capability = get_unaligned_le16(sub->data + 2); + capability = get_unaligned_le16(profile + 2); bss = cfg80211_inform_single_bss_data(wiphy, data, ftype, new_bssid, tsf, @@ -1487,7 +1639,9 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, } } +out: kfree(new_ie); + kfree(profile); } struct cfg80211_bss * diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 44b2ce1bb13a..2abfff925aac 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -430,22 +430,43 @@ DECLARE_EVENT_CLASS(key_handle, BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) ); -DEFINE_EVENT(key_handle, rdev_add_key, +DEFINE_EVENT(key_handle, rdev_get_key, TP_PROTO(struct wiphy *wiphy, 
struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) ); -DEFINE_EVENT(key_handle, rdev_get_key, +DEFINE_EVENT(key_handle, rdev_del_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) ); -DEFINE_EVENT(key_handle, rdev_del_key, +TRACE_EVENT(rdev_add_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, - bool pairwise, const u8 *mac_addr), - TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) + bool pairwise, const u8 *mac_addr, u8 mode), + TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr, mode), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(mac_addr) + __field(u8, key_index) + __field(bool, pairwise) + __field(u8, mode) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(mac_addr, mac_addr); + __entry->key_index = key_index; + __entry->pairwise = pairwise; + __entry->mode = mode; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, " + "mode: %u, pairwise: %s, mac addr: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, + __entry->mode, BOOL_TO_STR(__entry->pairwise), + MAC_PR_ARG(mac_addr)) ); TRACE_EVENT(rdev_set_default_key, @@ -3362,6 +3383,62 @@ TRACE_EVENT(cfg80211_pmsr_complete, WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long long)__entry->cookie) ); + +TRACE_EVENT(rdev_update_owe_info, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info), + TP_ARGS(wiphy, netdev, owe_info), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u16, status) + __dynamic_array(u8, ie, owe_info->ie_len)), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, owe_info->peer); + __entry->status = owe_info->status; + memcpy(__get_dynamic_array(ie), + owe_info->ie, owe_info->ie_len);), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT + " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->status) +); + +TRACE_EVENT(cfg80211_update_owe_info_event, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info), + TP_ARGS(wiphy, netdev, owe_info), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __dynamic_array(u8, ie, owe_info->ie_len)), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, owe_info->peer); + memcpy(__get_dynamic_array(ie), owe_info->ie, + owe_info->ie_len);), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) +); + +TRACE_EVENT(rdev_probe_mesh_link, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *dest, const u8 *buf, size_t len), + TP_ARGS(wiphy, netdev, dest, buf, len), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dest) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dest, dest); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest)) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index e4b8db5e81ec..cf63b635afc0 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -237,14 +237,23 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case 
WLAN_CIPHER_SUITE_GCMP_256: - /* Disallow pairwise keys with non-zero index unless it's WEP * or a vendor specific cipher (because current deployments use * pairwise WEP keys with non-zero indices and for vendor * specific ciphers this should be validated in the driver or * hardware level - but 802.11i clearly specifies to use zero) + /* IEEE 802.11-2016 allows only key index 0 for pairwise keys, + * or key index 1 when using Extended Key ID. + * @NL80211_KEY_NO_TX is only allowed for pairwise keys when + * the driver supports Extended Key ID. + * @NL80211_KEY_SET_TX can't be set when installing and + * validating a key. */ - if (pairwise && key_idx) + if (params->mode == NL80211_KEY_NO_TX) { + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EXT_KEY_ID)) + return -EINVAL; + else if (!pairwise || key_idx < 0 || key_idx > 1) + return -EINVAL; + } else if ((pairwise && key_idx) || + params->mode == NL80211_KEY_SET_TX) { return -EINVAL; + } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: @@ -1220,9 +1229,11 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; - else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", - rate->bw, rate->he_ru_alloc)) + else { + WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", + rate->bw, rate->he_ru_alloc); return 0; + } /* now scale to the appropriate MCS */ tmp = result; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index d522787c7354..46e4d69db845 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -353,9 +353,6 @@ static int cfg80211_wext_siwretry(struct net_device *dev, changed |= WIPHY_PARAM_RETRY_SHORT; } - if (!changed) - return 0; - err = rdev_set_wiphy_params(rdev, changed); if (err) { wdev->wiphy->retry_short = oshort; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 20a511398389..0ea48a52ce79 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1398,18 +1398,6 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } - case SIOCGSTAMP: - rc = -EINVAL; - if (sk) - rc = sock_get_timestamp(sk, - (struct timeval __user *)argp); - break; - case SIOCGSTAMPNS: - rc = -EINVAL; - if (sk) - rc = sock_get_timestampns(sk, - (struct timespec __user *)argp); - break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1681,8 +1669,6 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); - struct sock *sk = sock->sk; - int rc = -ENOIOCTLCMD; switch(cmd) { @@ -1690,18 +1676,6 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; - case SIOCGSTAMP: - rc = -EINVAL; - if (sk) - rc = compat_sock_get_timestamp(sk, - (struct timeval __user*)argp); - break; - case SIOCGSTAMPNS: - rc = -EINVAL; - if (sk) - rc = compat_sock_get_timestampns(sk, - (struct timespec __user*)argp); - break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1765,6 +1739,7 @@ static const struct proto_ops x25_proto_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif + .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 989e52386c35..2b18223e7eb8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -253,8 +253,8 @@ static
int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(&current->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(&current->mm->mmap_sem); if (npgs != umem->npgs) { diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 610c0bdc0c2b..88b9ae24658d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -43,6 +43,48 @@ struct xsk_queue { u64 invalid_descs; }; +/* The structure of the shared state of the rings is the same as the + * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion + * ring, the kernel is the producer and user space is the consumer. For + * the Tx and fill rings, the kernel is the consumer and user space is + * the producer. + * + * producer consumer + * + * if (LOAD ->consumer) { LOAD ->producer + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->producer STORE ->consumer + * } + * + * (A) pairs with (D), and (B) pairs with (C). + * + * Starting with (B), it protects the data from being written after + * the producer pointer. If this barrier was missing, the consumer + * could observe the producer pointer being set and thus load the data + * before the producer has written the new data. The consumer would in + * this case load the old data. + * + * (C) protects the consumer from speculatively loading the data before + * the producer pointer actually has been read. If we do not have this + * barrier, some architectures could load old data as speculative loads + * are not discarded as the CPU does not know there is a dependency + * between ->producer and data. + * + * (A) is a control dependency that separates the load of ->consumer + * from the stores of $data. In case ->consumer indicates there is no + * room in the buffer to store $data, we do not store it, so no + * barrier is needed. + * + * (D) keeps the load of the data from being observed to happen after + * the store of the consumer pointer. If we did not have this memory + * barrier, the producer could observe the consumer pointer being set + * and overwrite the data with a new value before the consumer got the + * chance to read the old value. The consumer would thus miss reading + * the old entry and very likely read the new entry twice, once right + * now and again after circling through the ring.
+ */ + /* Common functions operating for both RXTX and umem queues */ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) @@ -106,6 +148,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) { if (q->cons_tail == q->cons_head) { + smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); @@ -128,10 +171,11 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) if (xskq_nb_free(q, q->prod_tail, 1) == 0) return -ENOSPC; + /* A, matches D */ ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ - smp_wmb(); + smp_wmb(); /* B, matches C */ WRITE_ONCE(q->ring->producer, q->prod_tail); return 0; @@ -144,6 +188,7 @@ static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) return -ENOSPC; + /* A, matches D */ ring->desc[q->prod_head++ & q->ring_mask] = addr; return 0; } @@ -152,7 +197,7 @@ static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, u32 nb_entries) { /* Order producer and data */ - smp_wmb(); + smp_wmb(); /* B, matches C */ q->prod_tail += nb_entries; WRITE_ONCE(q->ring->producer, q->prod_tail); @@ -163,6 +208,7 @@ static inline int xskq_reserve_addr(struct xsk_queue *q) if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; + /* A, matches D */ q->prod_head++; return 0; } @@ -204,11 +250,12 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, struct xdp_desc *desc) { if (q->cons_tail == q->cons_head) { + smp_mb(); /* D, matches A */ WRITE_ONCE(q->ring->consumer, q->cons_tail); q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); /* Order consumer and data */ - smp_rmb(); + smp_rmb(); /* C, matches B */ } return xskq_validate_desc(q, desc); @@ -228,6 +275,7 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; + /* A, matches D */ idx = (q->prod_head++) & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; @@ -238,7 +286,7 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q) { /* Order producer and data */ - smp_wmb(); + smp_wmb(); /* B, matches C */ q->prod_tail = q->prod_head; WRITE_ONCE(q->ring->producer, q->prod_tail); diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 5d43aaa17027..1ec8071226b2 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -3,7 +3,7 @@ # config XFRM bool - depends on NET + depends on INET select GRO_CELLS select SKB_EXTENSIONS @@ -15,9 +15,9 @@ config XFRM_ALGO select XFRM select CRYPTO +if INET config XFRM_USER tristate "Transformation user configuration interface" - depends on INET select XFRM_ALGO ---help--- Support for Transformation (XFRM) user configuration interface @@ -56,7 +56,7 @@ config XFRM_MIGRATE config XFRM_STATISTICS bool "Transformation statistics" - depends on INET && XFRM && PROC_FS + depends on XFRM && PROC_FS ---help--- These statistics are not a SNMP/MIB specification but show statistics about the transformation error (or almost error) factor @@ -95,3 +95,5 @@ config NET_KEY_MIGRATE <draft-sugimoto-mip6-pfkey-migrate>. If unsure, say N.
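An editorial aside on the ring barrier scheme documented in the xsk_queue.h comment above: the same (A)/(B)/(C)/(D) pairing can be expressed portably in user space with C11 atomics, using an acquire load where the kernel uses smp_rmb() or the control dependency, and a release store where it uses smp_wmb() or smp_mb() before publishing an index. The single-producer/single-consumer sketch below is illustrative only; it is not kernel code and every name in it is invented.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define RING_SIZE 256	/* power of two, so "index & (RING_SIZE - 1)" wraps */

	struct spsc_ring {
		_Atomic uint32_t producer;	/* free-running indices */
		_Atomic uint32_t consumer;
		uint64_t desc[RING_SIZE];
	};

	/* Producer side: the acquire load is (A), the release store is (B). */
	static bool spsc_produce(struct spsc_ring *r, uint64_t val)
	{
		uint32_t prod = atomic_load_explicit(&r->producer,
						     memory_order_relaxed);
		uint32_t cons = atomic_load_explicit(&r->consumer,
						     memory_order_acquire); /* A */

		if (prod - cons == RING_SIZE)
			return false;	/* full: nothing stored, no barrier needed */

		r->desc[prod & (RING_SIZE - 1)] = val;
		/* B: make the data visible before the new producer index */
		atomic_store_explicit(&r->producer, prod + 1,
				      memory_order_release);
		return true;
	}

	/* Consumer side: the acquire load is (C), the release store is (D). */
	static bool spsc_consume(struct spsc_ring *r, uint64_t *val)
	{
		uint32_t cons = atomic_load_explicit(&r->consumer,
						     memory_order_relaxed);
		uint32_t prod = atomic_load_explicit(&r->producer,
						     memory_order_acquire); /* C */

		if (cons == prod)
			return false;	/* empty */

		*val = r->desc[cons & (RING_SIZE - 1)];
		/* D: order the data load before the consumer index store */
		atomic_store_explicit(&r->consumer, cons + 1,
				      memory_order_release);
		return true;
	}

Note that (D) has to order a load against a later store; in C11 terms a release store already covers that, while the kernel spells it smp_mb() because a plain WRITE_ONCE() carries no ordering of its own.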
+ +endif # INET diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index b8736f56e7f7..b24cd86a02c3 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -23,6 +23,60 @@ #include <linux/notifier.h> #ifdef CONFIG_XFRM_OFFLOAD +static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb, + unsigned int hsize) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + hsize + x->props.header_len); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb_reset_transport_header(skb); + skb->transport_header -= x->props.header_len; + } +} + +static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, + unsigned int hsize) + +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + if (xo->flags & XFRM_GSO_SEGMENT) + skb->transport_header = skb->network_header + hsize; + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + x->props.header_len); +} + +/* Adjust pointers into the packet when IPsec is done at layer2 */ +static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) +{ + switch (x->outer_mode.encap) { + case XFRM_MODE_TUNNEL: + if (x->outer_mode.family == AF_INET) + return __xfrm_mode_tunnel_prep(x, skb, + sizeof(struct iphdr)); + if (x->outer_mode.family == AF_INET6) + return __xfrm_mode_tunnel_prep(x, skb, + sizeof(struct ipv6hdr)); + break; + case XFRM_MODE_TRANSPORT: + if (x->outer_mode.family == AF_INET) + return __xfrm_transport_prep(x, skb, + sizeof(struct iphdr)); + if (x->outer_mode.family == AF_INET6) + return __xfrm_transport_prep(x, skb, + sizeof(struct ipv6hdr)); + break; + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + case XFRM_MODE_BEET: + break; + } +} + struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; @@ -78,7 +132,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur } if (!skb->next) { - x->outer_mode->xmit(x, skb); + esp_features |= skb->dev->gso_partial_features; + xfrm_outer_mode_prep(x, skb); xo->flags |= XFRM_DEV_RESUME; @@ -101,12 +156,14 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur do { struct sk_buff *nskb = skb2->next; + + esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; - x->outer_mode->xmit(x, skb2); + xfrm_outer_mode_prep(x, skb2); err = x->type_offload->xmit(x, skb2, esp_features); if (!err) { @@ -247,7 +304,7 @@ void xfrm_dev_resume(struct sk_buff *skb) unsigned long flags; rcu_read_lock(); - txq = netdev_pick_tx(dev, skb, NULL); + txq = netdev_core_pick_tx(dev, skb, NULL); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) diff --git a/net/xfrm/xfrm_inout.h b/net/xfrm/xfrm_inout.h new file mode 100644 index 000000000000..c7b0318938e2 --- /dev/null +++ b/net/xfrm/xfrm_inout.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/ipv6.h> +#include <net/dsfield.h> +#include <net/xfrm.h> + +#ifndef XFRM_INOUT_H +#define XFRM_INOUT_H 1 + +static inline void xfrm6_beet_make_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + iph->version = 6; + + memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(iph->flow_lbl)); + iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; + + ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); + iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; +} + +static inline void xfrm4_beet_make_header(struct sk_buff 
*skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->ihl = 5; + iph->version = 4; + + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; + iph->tos = XFRM_MODE_SKB_CB(skb)->tos; + + iph->id = XFRM_MODE_SKB_CB(skb)->id; + iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; + iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; +} + +#endif diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b3b613660d44..314973aaa414 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -21,6 +21,8 @@ #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> +#include "xfrm_inout.h" + struct xfrm_trans_tasklet { struct tasklet_struct tasklet; struct sk_buff_head queue; @@ -166,35 +168,299 @@ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) } EXPORT_SYMBOL(xfrm_parse_spi); -int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm4_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph; + int optlen = 0; + int err = -EINVAL; + + if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { + struct ip_beet_phdr *ph; + int phlen; + + if (!pskb_may_pull(skb, sizeof(*ph))) + goto out; + + ph = (struct ip_beet_phdr *)skb->data; + + phlen = sizeof(*ph) + ph->padlen; + optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); + if (optlen < 0 || optlen & 3 || optlen > 250) + goto out; + + XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; + + if (!pskb_may_pull(skb, phlen)) + goto out; + __skb_pull(skb, phlen); + } + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + xfrm4_beet_make_header(skb); + + iph = ip_hdr(skb); + + iph->ihl += optlen / 4; + iph->tot_len = htons(skb->len); + iph->daddr = x->sel.daddr.a4; + iph->saddr = x->sel.saddr.a4; + iph->check = 0; + iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); + err = 0; +out: + return err; +} + +static void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *inner_iph = ipip_hdr(skb); + + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_mode *inner_mode = x->inner_mode; + int err = -EINVAL; + + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) + goto out; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + if (skb->mac_len) + eth_hdr(skb)->h_proto = skb->protocol; + + err = 0; + +out: + return err; +} + +static void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *inner_iph = ipipv6_hdr(skb); + + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) + IP6_ECN_set_ce(skb, inner_iph); +} + +static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), + ipipv6_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + if (skb->mac_len) 
+ eth_hdr(skb)->h_proto = skb->protocol; + + err = 0; + +out: + return err; +} + +static int xfrm6_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + int size = sizeof(struct ipv6hdr); int err; - err = x->outer_mode->afinfo->extract_input(x, skb); + err = skb_cow_head(skb, size + skb->mac_len); if (err) + goto out; + + __skb_push(skb, size); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + xfrm6_beet_make_header(skb); + + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(skb->len - size); + ip6h->daddr = x->sel.daddr.in6; + ip6h->saddr = x->sel.saddr.in6; + err = 0; +out: + return err; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation + * header. + * + * On entry, the transport header shall point to where the IP header + * should be and the network header shall be set to where the IP + * header currently is. skb->data shall point to the start of the + * payload. + */ +static int +xfrm_inner_mode_encap_remove(struct xfrm_state *x, + const struct xfrm_mode *inner_mode, + struct sk_buff *skb) +{ + switch (inner_mode->encap) { + case XFRM_MODE_BEET: + if (inner_mode->family == AF_INET) + return xfrm4_remove_beet_encap(x, skb); + if (inner_mode->family == AF_INET6) + return xfrm6_remove_beet_encap(x, skb); + break; + case XFRM_MODE_TUNNEL: + if (inner_mode->family == AF_INET) + return xfrm4_remove_tunnel_encap(x, skb); + if (inner_mode->family == AF_INET6) + return xfrm6_remove_tunnel_encap(x, skb); + break; + } + + WARN_ON_ONCE(1); + return -EOPNOTSUPP; +} + +static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) +{ + const struct xfrm_mode *inner_mode = &x->inner_mode; + const struct xfrm_state_afinfo *afinfo; + int err = -EAFNOSUPPORT; + + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); + if (likely(afinfo)) + err = afinfo->extract_input(x, skb); + + if (err) { + rcu_read_unlock(); return err; + } if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); - if (inner_mode == NULL) + if (!inner_mode) { + rcu_read_unlock(); return -EAFNOSUPPORT; + } } - skb->protocol = inner_mode->afinfo->eth_proto; - return inner_mode->input2(x, skb); + afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); + if (unlikely(!afinfo)) { + rcu_read_unlock(); + return -EAFNOSUPPORT; + } + + skb->protocol = afinfo->eth_proto; + rcu_read_unlock(); + return xfrm_inner_mode_encap_remove(x, inner_mode, skb); +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb_transport_header() shall point to where the IP header + * should be and skb_network_header() shall be set to where the IP header + * currently is. skb->data shall point to the start of the payload. 
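 *
 * Illustration (editor's sketch, not part of the patch) for the IPv4
 * transport-mode case below, where ihl = skb->data - skb_transport_header(skb):
 * before the move the buffer is [ IP header | encap header | payload ],
 * with the network header at the IP header and the transport header
 * ihl bytes before the payload; after
 *	memmove(skb_transport_header(skb), skb_network_header(skb), ihl);
 * the IP header sits immediately in front of the payload, the network
 * header is set equal to the transport header, and the stale encap
 * bytes are left behind the new header position.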
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb_transport_header() shall point to where the IP header
+ * should be and skb_network_header() shall be set to where the IP header
+ * currently is. skb->data shall point to the start of the payload.
+ */
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int ihl = skb->data - skb_transport_header(skb);
+
+	if (skb->transport_header != skb->network_header) {
+		memmove(skb_transport_header(skb),
+			skb_network_header(skb), ihl);
+		skb->network_header = skb->transport_header;
+	}
+	ip_hdr(skb)->tot_len = htons(skb->len + ihl);
+	skb_reset_transport_header(skb);
+	return 0;
+}
+
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	int ihl = skb->data - skb_transport_header(skb);
+
+	if (skb->transport_header != skb->network_header) {
+		memmove(skb_transport_header(skb),
+			skb_network_header(skb), ihl);
+		skb->network_header = skb->transport_header;
+	}
+	ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
+					   sizeof(struct ipv6hdr));
+	skb_reset_transport_header(skb);
+	return 0;
+#else
+	WARN_ON_ONCE(1);
+	return -EAFNOSUPPORT;
+#endif
+}
+
+static int xfrm_inner_mode_input(struct xfrm_state *x,
+				 const struct xfrm_mode *inner_mode,
+				 struct sk_buff *skb)
+{
+	switch (inner_mode->encap) {
+	case XFRM_MODE_BEET:
+	case XFRM_MODE_TUNNEL:
+		return xfrm_prepare_input(x, skb);
+	case XFRM_MODE_TRANSPORT:
+		if (inner_mode->family == AF_INET)
+			return xfrm4_transport_input(x, skb);
+		if (inner_mode->family == AF_INET6)
+			return xfrm6_transport_input(x, skb);
+		break;
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+		WARN_ON_ONCE(1);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	return -EOPNOTSUPP;
 }
-EXPORT_SYMBOL(xfrm_prepare_input);
 
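Both transport-input helpers do the same pointer dance: when the network and transport headers differ, the IP header is memmove()d forward over the now-dead encapsulation header. A self-contained illustration, with plain buffer offsets standing in for the skb header pointers:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* 4 bytes of "IP header", 9 bytes of "ESP header", then payload */
        char pkt[] = "IPv4ESPHEADERdata";
        size_t ihl = 4;                   /* header length */
        size_t esp_len = 9;               /* bytes being collapsed */
        char *network = pkt;              /* where the IP header is */
        char *transport = pkt + esp_len;  /* where it should end up */

        memmove(transport, network, ihl); /* IP header over the ESP header */
        printf("%s\n", transport);        /* prints "IPv4data" */
        return 0;
    }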
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 {
+	const struct xfrm_state_afinfo *afinfo;
 	struct net *net = dev_net(skb->dev);
+	const struct xfrm_mode *inner_mode;
 	int err;
 	__be32 seq;
 	__be32 seq_hi;
 	struct xfrm_state *x = NULL;
 	xfrm_address_t *daddr;
-	struct xfrm_mode *inner_mode;
 	u32 mark = skb->mark;
 	unsigned int family = AF_UNSPEC;
 	int decaps = 0;
@@ -216,7 +482,7 @@
 			goto drop;
 		}
 
-		family = x->outer_mode->afinfo->family;
+		family = x->outer_mode.family;
 
 		/* An encap_type of -1 indicates async resumption. */
 		if (encap_type == -1) {
@@ -400,7 +666,7 @@ resume:
 
 		XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
 
-		inner_mode = x->inner_mode;
+		inner_mode = &x->inner_mode;
 
 		if (x->sel.family == AF_UNSPEC) {
 			inner_mode = xfrm_ip2inner_mode(x,
 					XFRM_MODE_SKB_CB(skb)->protocol);
@@ -410,12 +676,12 @@ resume:
 			}
 		}
 
-		if (inner_mode->input(x, skb)) {
+		if (xfrm_inner_mode_input(x, inner_mode, skb)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
 			goto drop;
 		}
 
-		if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) {
+		if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) {
 			decaps = 1;
 			break;
 		}
@@ -425,7 +691,7 @@
 		 * transport mode so the outer address is identical.
 		 */
 		daddr = &x->id.daddr;
-		family = x->outer_mode->afinfo->family;
+		family = x->outer_mode.family;
 
 		err = xfrm_parse_spi(skb, nexthdr, &spi, &seq);
 		if (err < 0) {
@@ -453,7 +719,12 @@ resume:
 	if (xo)
 		xfrm_gro = xo->flags & XFRM_GRO;
 
-	err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async);
+	err = -EAFNOSUPPORT;
+	rcu_read_lock();
+	afinfo = xfrm_state_afinfo_get_rcu(x->inner_mode.family);
+	if (likely(afinfo))
+		err = afinfo->transport_finish(skb, xfrm_gro || async);
+	rcu_read_unlock();
 	if (xfrm_gro) {
 		sp = skb_sec_path(skb);
 		if (sp)
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index dbb3c1945b5c..ad3a2555c517 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -70,17 +70,28 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
 	return NULL;
 }
 
-static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb)
+static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb,
+					    unsigned short family)
 {
 	struct xfrmi_net *xfrmn;
-	int ifindex;
 	struct xfrm_if *xi;
+	int ifindex = 0;
 
 	if (!secpath_exists(skb) || !skb->dev)
 		return NULL;
 
+	switch (family) {
+	case AF_INET6:
+		ifindex = inet6_sdif(skb);
+		break;
+	case AF_INET:
+		ifindex = inet_sdif(skb);
+		break;
+	}
+	if (!ifindex)
+		ifindex = skb->dev->ifindex;
+
 	xfrmn = net_generic(xs_net(xfrm_input_state(skb)), xfrmi_net_id);
-	ifindex = skb->dev->ifindex;
 
 	for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
 		if (ifindex == xi->dev->ifindex &&
@@ -244,8 +255,8 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
 
 static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
 {
+	const struct xfrm_mode *inner_mode;
 	struct pcpu_sw_netstats *tstats;
-	struct xfrm_mode *inner_mode;
 	struct net_device *dev;
 	struct xfrm_state *x;
 	struct xfrm_if *xi;
@@ -273,7 +284,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
 	xnet = !net_eq(xi->net, dev_net(skb->dev));
 
 	if (xnet) {
-		inner_mode = x->inner_mode;
+		inner_mode = &x->inner_mode;
 
 		if (x->sel.family == AF_UNSPEC) {
 			inner_mode = xfrm_ip2inner_mode(x,
 					XFRM_MODE_SKB_CB(skb)->protocol);
@@ -285,7 +296,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
 		}
 
 		if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
-				       inner_mode->afinfo->family))
+				       inner_mode->family))
 			return -EPERM;
 	}
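The new family argument to xfrmi_decode_session() exists so the lookup can honour VRFs: inet_sdif()/inet6_sdif() report the slave ifindex when the packet arrived through an L3 master device, and only when that is zero does the receiving device's index apply. The selection logic, reduced to its essence with a hypothetical helper name:

    #include <stdio.h>

    /* sdif == 0 means "not received through an L3 master device" */
    static int pick_ifindex(int sdif, int rx_ifindex)
    {
        return sdif ? sdif : rx_ifindex;
    }

    int main(void)
    {
        printf("%d\n", pick_ifindex(0, 7));  /* plain device: 7 */
        printf("%d\n", pick_ifindex(12, 7)); /* VRF slave: 12 */
        return 0;
    }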
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 9333153bafda..a55510f9ff35 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -17,9 +17,13 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <net/dst.h>
+#include <net/inet_ecn.h>
 #include <net/xfrm.h>
 
+#include "xfrm_inout.h"
+
 static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb);
+static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb);
 
 static int xfrm_skb_check_space(struct sk_buff *skb)
 {
@@ -50,6 +54,360 @@ static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
 	return child;
 }
 
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ */
+static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	int ihl = iph->ihl * 4;
+
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->mac_header = skb->network_header +
+			  offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + ihl;
+	__skb_pull(skb, ihl);
+	memmove(skb_network_header(skb), iph, ihl);
+	return 0;
+}
+
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ */
+static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	iph = ipv6_hdr(skb);
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
+	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+	if (hdr_len < 0)
+		return hdr_len;
+	skb_set_mac_header(skb,
+			   (prevhdr - x->props.header_len) - skb->data);
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->transport_header = skb->network_header + hdr_len;
+	__skb_pull(skb, hdr_len);
+	memmove(ipv6_hdr(skb), iph, hdr_len);
+	return 0;
+#else
+	WARN_ON_ONCE(1);
+	return -EAFNOSUPPORT;
+#endif
+}
+
+/* Add route optimization header space.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the route optimization header.
+ */
+static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	iph = ipv6_hdr(skb);
+
+	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+	if (hdr_len < 0)
+		return hdr_len;
+	skb_set_mac_header(skb,
+			   (prevhdr - x->props.header_len) - skb->data);
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->transport_header = skb->network_header + hdr_len;
+	__skb_pull(skb, hdr_len);
+	memmove(ipv6_hdr(skb), iph, hdr_len);
+
+	x->lastused = ktime_get_real_seconds();
+
+	return 0;
+#else
+	WARN_ON_ONCE(1);
+	return -EAFNOSUPPORT;
+#endif
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
+ */
+static int xfrm4_beet_encap_add(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_beet_phdr *ph;
+	struct iphdr *top_iph;
+	int hdrlen, optlen;
+
+	hdrlen = 0;
+	optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+	if (unlikely(optlen))
+		hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
+
+	skb_set_network_header(skb, -x->props.header_len - hdrlen +
+			       (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
+	if (x->sel.family != AF_INET6)
+		skb->network_header += IPV4_BEET_PHMAXLEN;
+	skb->mac_header = skb->network_header +
+			  offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+	xfrm4_beet_make_header(skb);
+
+	ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
+
+	top_iph = ip_hdr(skb);
+
+	if (unlikely(optlen)) {
+		if (WARN_ON(optlen < 0))
+			return -EINVAL;
+
+		ph->padlen = 4 - (optlen & 4);
+		ph->hdrlen = optlen / 8;
+		ph->nexthdr = top_iph->protocol;
+		if (ph->padlen)
+			memset(ph + 1, IPOPT_NOP, ph->padlen);
+
+		top_iph->protocol = IPPROTO_BEETPH;
+		top_iph->ihl = sizeof(struct iphdr) / 4;
+	}
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+
+	return 0;
+}
+
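The padlen/hdrlen fields written by xfrm4_beet_encap_add() above are the encode side of the pseudo-header arithmetic that xfrm4_remove_beet_encap() inverts on input; pairing the two makes a nice invariant to check. A small self-test, assuming IPv4 options are 4-byte aligned and at most 40 bytes:

    #include <assert.h>
    #include <stdio.h>

    #define PH_SIZE 4            /* sizeof(struct ip_beet_phdr) */
    #define IPV4_BEET_PHMAXLEN 8

    int main(void)
    {
        int optlen;

        for (optlen = 4; optlen <= 40; optlen += 4) {
            int padlen = 4 - (optlen & 4); /* 0 or 4 pad bytes */
            int hdrlen = optlen / 8;       /* in 8-byte units */
            int phlen = PH_SIZE + padlen;
            int recovered = hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);

            assert(recovered == optlen);   /* decode inverts encode */
        }
        printf("all option lengths round-trip\n");
        return 0;
    }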
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.
+ */
+static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct iphdr *top_iph;
+	int flags;
+
+	skb_set_inner_network_header(skb, skb_network_offset(skb));
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->mac_header = skb->network_header +
+			  offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+	top_iph = ip_hdr(skb);
+
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+
+	top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
+
+	/* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
+	if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+		top_iph->tos = 0;
+	else
+		top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+	top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
+					    XFRM_MODE_SKB_CB(skb)->tos);
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
+
+	top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *top_iph;
+	int dsfield;
+
+	skb_set_inner_network_header(skb, skb_network_offset(skb));
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->mac_header = skb->network_header +
+			  offsetof(struct ipv6hdr, nexthdr);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+	top_iph = ipv6_hdr(skb);
+
+	top_iph->version = 6;
+
+	memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
+	       sizeof(top_iph->flow_lbl));
+	top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family);
+
+	if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+		dsfield = 0;
+	else
+		dsfield = XFRM_MODE_SKB_CB(skb)->tos;
+	dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos);
+	if (x->props.flags & XFRM_STATE_NOECN)
+		dsfield &= ~INET_ECN_MASK;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+	top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
+	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+	return 0;
+}
+
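Both tunnel_encap_add variants above funnel the inner TOS through INET_ECN_encapsulate(): the outer header inherits the inner ECN bits, except that CE is written out as ECT(0) rather than copied, which is the behaviour encoded in include/net/inet_ecn.h. A sketch of that rule with the standard ECN codepoints:

    #include <stdio.h>

    #define INET_ECN_NOT_ECT 0
    #define INET_ECN_ECT_1   1
    #define INET_ECN_ECT_0   2
    #define INET_ECN_CE      3
    #define INET_ECN_MASK    3

    static unsigned char ecn_encapsulate(unsigned char outer,
                                         unsigned char inner)
    {
        unsigned char ecn = inner & INET_ECN_MASK;

        outer &= ~INET_ECN_MASK;
        outer |= (ecn == INET_ECN_CE) ? INET_ECN_ECT_0 : ecn;
        return outer;
    }

    int main(void)
    {
        printf("%d\n", ecn_encapsulate(0, INET_ECN_CE));    /* 2: ECT(0) */
        printf("%d\n", ecn_encapsulate(0, INET_ECN_ECT_1)); /* 1 */
        return 0;
    }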
+static int xfrm6_beet_encap_add(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *top_iph;
+	struct ip_beet_phdr *ph;
+	int optlen, hdr_len;
+
+	hdr_len = 0;
+	optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+	if (unlikely(optlen))
+		hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4);
+
+	skb_set_network_header(skb, -x->props.header_len - hdr_len);
+	if (x->sel.family != AF_INET6)
+		skb->network_header += IPV4_BEET_PHMAXLEN;
+	skb->mac_header = skb->network_header +
+			  offsetof(struct ipv6hdr, nexthdr);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+	ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdr_len);
+
+	xfrm6_beet_make_header(skb);
+
+	top_iph = ipv6_hdr(skb);
+	if (unlikely(optlen)) {
+		if (WARN_ON(optlen < 0))
+			return -EINVAL;
+
+		ph->padlen = 4 - (optlen & 4);
+		ph->hdrlen = optlen / 8;
+		ph->nexthdr = top_iph->nexthdr;
+		if (ph->padlen)
+			memset(ph + 1, IPOPT_NOP, ph->padlen);
+
+		top_iph->nexthdr = IPPROTO_BEETPH;
+	}
+
+	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+	return 0;
+}
+#endif
+
+/* Add encapsulation header.
+ *
+ * On exit, the transport header will be set to the start of the
+ * encapsulation header to be filled in by x->type->output and the mac
+ * header will be set to the nextheader (protocol for IPv4) field of the
+ * extension header directly preceding the encapsulation header, or in
+ * its absence, that of the top IP header.
+ * The value of the network header will always point to the top IP header
+ * while skb->data will point to the payload.
+ */
+static int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+
+	err = xfrm_inner_extract_output(x, skb);
+	if (err)
+		return err;
+
+	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+	skb->protocol = htons(ETH_P_IP);
+
+	switch (x->outer_mode.encap) {
+	case XFRM_MODE_BEET:
+		return xfrm4_beet_encap_add(x, skb);
+	case XFRM_MODE_TUNNEL:
+		return xfrm4_tunnel_encap_add(x, skb);
+	}
+
+	WARN_ON_ONCE(1);
+	return -EOPNOTSUPP;
+}
+
+static int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	int err;
+
+	err = xfrm_inner_extract_output(x, skb);
+	if (err)
+		return err;
+
+	skb->ignore_df = 1;
+	skb->protocol = htons(ETH_P_IPV6);
+
+	switch (x->outer_mode.encap) {
+	case XFRM_MODE_BEET:
+		return xfrm6_beet_encap_add(x, skb);
+	case XFRM_MODE_TUNNEL:
+		return xfrm6_tunnel_encap_add(x, skb);
+	default:
+		WARN_ON_ONCE(1);
+		return -EOPNOTSUPP;
+	}
+#endif
+	WARN_ON_ONCE(1);
+	return -EAFNOSUPPORT;
+}
+
+static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	switch (x->outer_mode.encap) {
+	case XFRM_MODE_BEET:
+	case XFRM_MODE_TUNNEL:
+		if (x->outer_mode.family == AF_INET)
+			return xfrm4_prepare_output(x, skb);
+		if (x->outer_mode.family == AF_INET6)
+			return xfrm6_prepare_output(x, skb);
+		break;
+	case XFRM_MODE_TRANSPORT:
+		if (x->outer_mode.family == AF_INET)
+			return xfrm4_transport_output(x, skb);
+		if (x->outer_mode.family == AF_INET6)
+			return xfrm6_transport_output(x, skb);
+		break;
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+		if (x->outer_mode.family == AF_INET6)
+			return xfrm6_ro_output(x, skb);
+		WARN_ON_ONCE(1);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+#if IS_ENABLED(CONFIG_NET_PKTGEN)
+int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return xfrm_outer_mode_output(x, skb);
+}
+EXPORT_SYMBOL_GPL(pktgen_xfrm_outer_mode_output);
+#endif
+
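Taken together, xfrm_outer_mode_output() replaces what used to be an indirect call through x->outer_mode->output with a switch over a small enum on a const, built-in mode descriptor, so call sites become direct branches. The shape of that refactoring in miniature, with purely illustrative names:

    #include <stdio.h>

    enum encap_mode { MODE_TRANSPORT, MODE_TUNNEL, MODE_MAX };

    struct packet { int len; };

    static int transport_output(struct packet *p) { return p->len; }
    static int tunnel_output(struct packet *p) { return p->len + 20; }

    /* direct dispatch: no function pointers, no module refcounting */
    static int outer_mode_output(enum encap_mode mode, struct packet *p)
    {
        switch (mode) {
        case MODE_TRANSPORT:
            return transport_output(p);
        case MODE_TUNNEL:
            return tunnel_output(p);
        default:
            return -1; /* -EOPNOTSUPP in the kernel */
        }
    }

    int main(void)
    {
        struct packet p = { 1400 };

        printf("%d\n", outer_mode_output(MODE_TUNNEL, &p)); /* 1420 */
        return 0;
    }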
 static int xfrm_output_one(struct sk_buff *skb, int err)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -68,7 +426,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 
 	skb->mark = xfrm_smark_get(skb->mark, x);
 
-	err = x->outer_mode->output(x, skb);
+	err = xfrm_outer_mode_output(x, skb);
 	if (err) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
 		goto error_nolock;
@@ -131,7 +489,7 @@ resume:
 		}
 		skb_dst_set(skb, dst);
 		x = dst->xfrm;
-	} while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
+	} while (x && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL));
 
 	return 0;
 
@@ -258,20 +616,29 @@ out:
 }
 EXPORT_SYMBOL_GPL(xfrm_output);
 
-int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_mode *inner_mode;
+	const struct xfrm_state_afinfo *afinfo;
+	const struct xfrm_mode *inner_mode;
+	int err = -EAFNOSUPPORT;
+
 	if (x->sel.family == AF_UNSPEC)
 		inner_mode = xfrm_ip2inner_mode(x,
 				xfrm_af2proto(skb_dst(skb)->ops->family));
 	else
-		inner_mode = x->inner_mode;
+		inner_mode = &x->inner_mode;
 
 	if (inner_mode == NULL)
 		return -EAFNOSUPPORT;
-	return inner_mode->afinfo->extract_output(x, skb);
+
+	rcu_read_lock();
+	afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
+	if (likely(afinfo))
+		err = afinfo->extract_output(x, skb);
+	rcu_read_unlock();
+
+	return err;
 }
-EXPORT_SYMBOL_GPL(xfrm_inner_extract_output);
 
 void xfrm_local_error(struct sk_buff *skb, int mtu)
 {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8d1a898d0ba5..410233c5681e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -27,10 +27,14 @@
 #include <linux/cpu.h>
 #include <linux/audit.h>
 #include <linux/rhashtable.h>
+#include <linux/if_tunnel.h>
 #include <net/dst.h>
 #include <net/flow.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+#include <net/mip6.h>
+#endif
 #ifdef CONFIG_XFRM_STATISTICS
 #include <net/snmp.h>
 #endif
@@ -2450,18 +2454,10 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
 
 static int xfrm_get_tos(const struct flowi *fl, int family)
 {
-	const struct xfrm_policy_afinfo *afinfo;
-	int tos;
+	if (family == AF_INET)
+		return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos;
 
-	afinfo = xfrm_policy_get_afinfo(family);
-	if (!afinfo)
-		return 0;
-
-	tos = afinfo->get_tos(fl);
-
-	rcu_read_unlock();
-
-	return tos;
+	return 0;
 }
 
 static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
@@ -2499,21 +2495,14 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 	return xdst;
 }
 
-static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
-				 int nfheader_len)
+static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+			   int nfheader_len)
 {
-	const struct xfrm_policy_afinfo *afinfo =
-		xfrm_policy_get_afinfo(dst->ops->family);
-	int err;
-
-	if (!afinfo)
-		return -EINVAL;
-
-	err = afinfo->init_path(path, dst, nfheader_len);
-
-	rcu_read_unlock();
-
-	return err;
+	if (dst->ops->family == AF_INET6) {
+		struct rt6_info *rt = (struct rt6_info *)dst;
+		path->path_cookie = rt6_get_cookie(rt);
+		path->u.rt6.rt6i_nfheader_len = nfheader_len;
+	}
 }
 
 static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
@@ -2545,10 +2534,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 					    const struct flowi *fl,
 					    struct dst_entry *dst)
 {
+	const struct xfrm_state_afinfo *afinfo;
+	const struct xfrm_mode *inner_mode;
 	struct net *net = xp_net(policy);
 	unsigned long now = jiffies;
 	struct net_device *dev;
-	struct xfrm_mode *inner_mode;
 	struct xfrm_dst *xdst_prev = NULL;
 	struct xfrm_dst *xdst0 = NULL;
 	int i = 0;
@@ -2594,7 +2584,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 				goto put_states;
 			}
 		} else
-			inner_mode = xfrm[i]->inner_mode;
+			inner_mode = &xfrm[i]->inner_mode;
 
 		xdst->route = dst;
 		dst_copy_metrics(dst1, dst);
@@ -2622,7 +2612,14 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
 		dst1->lastuse = now;
 
 		dst1->input = dst_discard;
-		dst1->output = inner_mode->afinfo->output;
+
+		rcu_read_lock();
+		afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
+		if (likely(afinfo))
+			dst1->output = afinfo->output;
+		else
+			dst1->output = dst_discard_out;
+		rcu_read_unlock();
 
 		xdst_prev = xdst;
 
@@ -3263,20 +3260,229 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star
 	return start;
 }
 
+static void
+decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
+	struct flowi4 *fl4 = &fl->u.ip4;
+	int oif = 0;
+
+	if (skb_dst(skb))
+		oif = skb_dst(skb)->dev->ifindex;
+
+	memset(fl4, 0, sizeof(struct flowi4));
+	fl4->flowi4_mark = skb->mark;
+	fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
+
+	if (!ip_is_fragment(iph)) {
+		switch (iph->protocol) {
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+		case IPPROTO_TCP:
+		case IPPROTO_SCTP:
+		case IPPROTO_DCCP:
+			if (xprth + 4 < skb->data ||
+			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
+				__be16 *ports;
+
+				xprth = skb_network_header(skb) + iph->ihl * 4;
+				ports = (__be16 *)xprth;
+
+				fl4->fl4_sport = ports[!!reverse];
+				fl4->fl4_dport = ports[!reverse];
+			}
+			break;
+		case IPPROTO_ICMP:
+			if (xprth + 2 < skb->data ||
+			    pskb_may_pull(skb, xprth + 2 - skb->data)) {
+				u8 *icmp;
+
+				xprth = skb_network_header(skb) + iph->ihl * 4;
+				icmp = xprth;
+
+				fl4->fl4_icmp_type = icmp[0];
+				fl4->fl4_icmp_code = icmp[1];
+			}
+			break;
+		case IPPROTO_ESP:
+			if (xprth + 4 < skb->data ||
+			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
+				__be32 *ehdr;
+
+				xprth = skb_network_header(skb) + iph->ihl * 4;
+				ehdr = (__be32 *)xprth;
+
+				fl4->fl4_ipsec_spi = ehdr[0];
+			}
+			break;
+		case IPPROTO_AH:
+			if (xprth + 8 < skb->data ||
+			    pskb_may_pull(skb, xprth + 8 - skb->data)) {
+				__be32 *ah_hdr;
+
+				xprth = skb_network_header(skb) + iph->ihl * 4;
+				ah_hdr = (__be32 *)xprth;
+
+				fl4->fl4_ipsec_spi = ah_hdr[1];
+			}
+			break;
+		case IPPROTO_COMP:
+			if (xprth + 4 < skb->data ||
+			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
+				__be16 *ipcomp_hdr;
+
+				xprth = skb_network_header(skb) + iph->ihl * 4;
+				ipcomp_hdr = (__be16 *)xprth;
+
+				fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+			}
+			break;
+		case IPPROTO_GRE:
+			if (xprth + 12 < skb->data ||
+			    pskb_may_pull(skb, xprth + 12 - skb->data)) {
+				__be16 *greflags;
+				__be32 *gre_hdr;
+
+				xprth = skb_network_header(skb) + iph->ihl * 4;
+				greflags = (__be16 *)xprth;
+				gre_hdr = (__be32 *)xprth;
+
+				if (greflags[0] & GRE_KEY) {
+					if (greflags[0] & GRE_CSUM)
+						gre_hdr++;
+					fl4->fl4_gre_key = gre_hdr[1];
+				}
+			}
+			break;
+		default:
+			fl4->fl4_ipsec_spi = 0;
+			break;
+		}
+	}
+	fl4->flowi4_proto = iph->protocol;
+	fl4->daddr = reverse ? iph->saddr : iph->daddr;
+	fl4->saddr = reverse ? iph->daddr : iph->saddr;
+	fl4->flowi4_tos = iph->tos;
+}
+
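decode_session4() reads both ports with the same two-element access: index !!reverse selects the source and !reverse the destination, so reversing a flow is just an index swap. Demonstrated standalone:

    #include <stdio.h>
    #include <stdint.h>

    static void decode_ports(const uint16_t *ports, int reverse,
                             uint16_t *sport, uint16_t *dport)
    {
        *sport = ports[!!reverse]; /* 0 forward, 1 reversed */
        *dport = ports[!reverse];  /* 1 forward, 0 reversed */
    }

    int main(void)
    {
        uint16_t hdr[2] = { 12345, 443 }; /* src, dst */
        uint16_t s, d;

        decode_ports(hdr, 0, &s, &d);
        printf("forward:  %u -> %u\n", (unsigned)s, (unsigned)d);
        decode_ports(hdr, 1, &s, &d);
        printf("reversed: %u -> %u\n", (unsigned)s, (unsigned)d);
        return 0;
    }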
+#if IS_ENABLED(CONFIG_IPV6)
+static void
+decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
+{
+	struct flowi6 *fl6 = &fl->u.ip6;
+	int onlyproto = 0;
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	u32 offset = sizeof(*hdr);
+	struct ipv6_opt_hdr *exthdr;
+	const unsigned char *nh = skb_network_header(skb);
+	u16 nhoff = IP6CB(skb)->nhoff;
+	int oif = 0;
+	u8 nexthdr;
+
+	if (!nhoff)
+		nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+	nexthdr = nh[nhoff];
+
+	if (skb_dst(skb))
+		oif = skb_dst(skb)->dev->ifindex;
+
+	memset(fl6, 0, sizeof(struct flowi6));
+	fl6->flowi6_mark = skb->mark;
+	fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
+
+	fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
+	fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
+
+	while (nh + offset + sizeof(*exthdr) < skb->data ||
+	       pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
+		nh = skb_network_header(skb);
+		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+
+		switch (nexthdr) {
+		case NEXTHDR_FRAGMENT:
+			onlyproto = 1;
+			/* fall through */
+		case NEXTHDR_ROUTING:
+		case NEXTHDR_HOP:
+		case NEXTHDR_DEST:
+			offset += ipv6_optlen(exthdr);
+			nexthdr = exthdr->nexthdr;
+			exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+			break;
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+		case IPPROTO_TCP:
+		case IPPROTO_SCTP:
+		case IPPROTO_DCCP:
+			if (!onlyproto && (nh + offset + 4 < skb->data ||
+			    pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
+				__be16 *ports;
+
+				nh = skb_network_header(skb);
+				ports = (__be16 *)(nh + offset);
+				fl6->fl6_sport = ports[!!reverse];
+				fl6->fl6_dport = ports[!reverse];
+			}
+			fl6->flowi6_proto = nexthdr;
+			return;
+		case IPPROTO_ICMPV6:
+			if (!onlyproto && (nh + offset + 2 < skb->data ||
+			    pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
+				u8 *icmp;
+
+				nh = skb_network_header(skb);
+				icmp = (u8 *)(nh + offset);
+				fl6->fl6_icmp_type = icmp[0];
+				fl6->fl6_icmp_code = icmp[1];
+			}
+			fl6->flowi6_proto = nexthdr;
+			return;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+		case IPPROTO_MH:
+			offset += ipv6_optlen(exthdr);
+			if (!onlyproto && (nh + offset + 3 < skb->data ||
+			    pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
+				struct ip6_mh *mh;
+
+				nh = skb_network_header(skb);
+				mh = (struct ip6_mh *)(nh + offset);
+				fl6->fl6_mh_type = mh->ip6mh_type;
+			}
+			fl6->flowi6_proto = nexthdr;
+			return;
+#endif
+		/* XXX Why are there these headers? */
+		case IPPROTO_AH:
+		case IPPROTO_ESP:
+		case IPPROTO_COMP:
+		default:
+			fl6->fl6_ipsec_spi = 0;
+			fl6->flowi6_proto = nexthdr;
+			return;
+		}
+	}
+}
+#endif
+
 int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
 			  unsigned int family, int reverse)
 {
-	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
-	int err;
-
-	if (unlikely(afinfo == NULL))
+	switch (family) {
+	case AF_INET:
+		decode_session4(skb, fl, reverse);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		decode_session6(skb, fl, reverse);
+		break;
+#endif
+	default:
 		return -EAFNOSUPPORT;
+	}
 
-	afinfo->decode_session(skb, fl, reverse);
-
-	err = security_xfrm_decode_session(skb, &fl->flowi_secid);
-	rcu_read_unlock();
-	return err;
+	return security_xfrm_decode_session(skb, &fl->flowi_secid);
 }
 EXPORT_SYMBOL(__xfrm_decode_session);
 
@@ -3313,7 +3519,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	ifcb = xfrm_if_get_cb();
 	if (ifcb) {
-		xi = ifcb->decode_session(skb);
+		xi = ifcb->decode_session(skb, family);
 		if (xi) {
 			if_id = xi->p.if_id;
 			net = xi->net;
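decode_session6() walks the extension-header chain with ipv6_optlen()-style strides, (hdrlen + 1) * 8 bytes for the generic option headers, and latches onlyproto once a fragment header is seen, since the ports may sit in a later fragment. A simplified user-space walk over a fabricated chain:

    #include <stdio.h>
    #include <stdint.h>

    struct opt_hdr { uint8_t nexthdr; uint8_t hdrlen; };

    #define NEXTHDR_HOP      0
    #define NEXTHDR_FRAGMENT 44
    #define NEXTHDR_TCP      6

    int main(void)
    {
        /* HOP (8 bytes) -> FRAGMENT (8 bytes) -> TCP */
        uint8_t pkt[64] = { 0 };
        struct opt_hdr *hop = (struct opt_hdr *)pkt;
        struct opt_hdr *frag = (struct opt_hdr *)(pkt + 8);
        unsigned int offset = 0, onlyproto = 0;
        unsigned int nexthdr = NEXTHDR_HOP;

        hop->nexthdr = NEXTHDR_FRAGMENT;
        frag->nexthdr = NEXTHDR_TCP;

        while (nexthdr == NEXTHDR_HOP || nexthdr == NEXTHDR_FRAGMENT) {
            struct opt_hdr *eh = (struct opt_hdr *)(pkt + offset);

            if (nexthdr == NEXTHDR_FRAGMENT)
                onlyproto = 1; /* ports unusable past this point */
            offset += (eh->hdrlen + 1) * 8; /* ipv6_optlen() stride */
            nexthdr = eh->nexthdr;
        }
        printf("proto %u at offset %u, onlyproto=%u\n",
               nexthdr, offset, onlyproto);
        return 0;
    }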
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1bb971f46fc6..c5d81316330b 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -173,7 +173,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 int __xfrm_state_delete(struct xfrm_state *x);
 
 int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol);
-bool km_is_alive(const struct km_event *c);
+static bool km_is_alive(const struct km_event *c);
 void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
 
 static DEFINE_SPINLOCK(xfrm_type_lock);
@@ -330,100 +330,67 @@ static void xfrm_put_type_offload(const struct xfrm_type_offload *type)
 	module_put(type->owner);
 }
 
-static DEFINE_SPINLOCK(xfrm_mode_lock);
-int xfrm_register_mode(struct xfrm_mode *mode, int family)
-{
-	struct xfrm_state_afinfo *afinfo;
-	struct xfrm_mode **modemap;
-	int err;
-
-	if (unlikely(mode->encap >= XFRM_MODE_MAX))
-		return -EINVAL;
-
-	afinfo = xfrm_state_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return -EAFNOSUPPORT;
-
-	err = -EEXIST;
-	modemap = afinfo->mode_map;
-	spin_lock_bh(&xfrm_mode_lock);
-	if (modemap[mode->encap])
-		goto out;
-
-	err = -ENOENT;
-	if (!try_module_get(afinfo->owner))
-		goto out;
-
-	mode->afinfo = afinfo;
-	modemap[mode->encap] = mode;
-	err = 0;
-
-out:
-	spin_unlock_bh(&xfrm_mode_lock);
-	rcu_read_unlock();
-	return err;
-}
-EXPORT_SYMBOL(xfrm_register_mode);
-
-int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
-{
-	struct xfrm_state_afinfo *afinfo;
-	struct xfrm_mode **modemap;
-	int err;
-
-	if (unlikely(mode->encap >= XFRM_MODE_MAX))
-		return -EINVAL;
-
-	afinfo = xfrm_state_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return -EAFNOSUPPORT;
-
-	err = -ENOENT;
-	modemap = afinfo->mode_map;
-	spin_lock_bh(&xfrm_mode_lock);
-	if (likely(modemap[mode->encap] == mode)) {
-		modemap[mode->encap] = NULL;
-		module_put(mode->afinfo->owner);
-		err = 0;
-	}
-
-	spin_unlock_bh(&xfrm_mode_lock);
-	rcu_read_unlock();
-	return err;
-}
-EXPORT_SYMBOL(xfrm_unregister_mode);
-
-static struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
-{
-	struct xfrm_state_afinfo *afinfo;
-	struct xfrm_mode *mode;
-	int modload_attempted = 0;
+static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
+	[XFRM_MODE_BEET] = {
+		.encap = XFRM_MODE_BEET,
+		.flags = XFRM_MODE_FLAG_TUNNEL,
+		.family = AF_INET,
+	},
+	[XFRM_MODE_TRANSPORT] = {
+		.encap = XFRM_MODE_TRANSPORT,
+		.family = AF_INET,
+	},
+	[XFRM_MODE_TUNNEL] = {
+		.encap = XFRM_MODE_TUNNEL,
+		.flags = XFRM_MODE_FLAG_TUNNEL,
+		.family = AF_INET,
+	},
+};
+
+static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
+	[XFRM_MODE_BEET] = {
+		.encap = XFRM_MODE_BEET,
+		.flags = XFRM_MODE_FLAG_TUNNEL,
+		.family = AF_INET6,
+	},
+	[XFRM_MODE_ROUTEOPTIMIZATION] = {
+		.encap = XFRM_MODE_ROUTEOPTIMIZATION,
+		.family = AF_INET6,
+	},
+	[XFRM_MODE_TRANSPORT] = {
+		.encap = XFRM_MODE_TRANSPORT,
+		.family = AF_INET6,
+	},
+	[XFRM_MODE_TUNNEL] = {
+		.encap = XFRM_MODE_TUNNEL,
+		.flags = XFRM_MODE_FLAG_TUNNEL,
+		.family = AF_INET6,
+	},
+};
+
+static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{
+	const struct xfrm_mode *mode;
 
 	if (unlikely(encap >= XFRM_MODE_MAX))
 		return NULL;
 
-retry:
-	afinfo = xfrm_state_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return NULL;
-
-	mode = READ_ONCE(afinfo->mode_map[encap]);
-	if (unlikely(mode && !try_module_get(mode->owner)))
-		mode = NULL;
-
-	rcu_read_unlock();
-	if (!mode && !modload_attempted) {
-		request_module("xfrm-mode-%d-%d", family, encap);
-		modload_attempted = 1;
-		goto retry;
+	switch (family) {
+	case AF_INET:
+		mode = &xfrm4_mode_map[encap];
+		if (mode->family == family)
+			return mode;
+		break;
+	case AF_INET6:
+		mode = &xfrm6_mode_map[encap];
+		if (mode->family == family)
+			return mode;
+		break;
+	default:
+		break;
 	}
 
-	return mode;
-}
-
-static void xfrm_put_mode(struct xfrm_mode *mode)
-{
-	module_put(mode->owner);
+	return NULL;
 }
 
 void xfrm_state_free(struct xfrm_state *x)
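The two const tables above replace runtime registration entirely: a lookup is now an array index plus a family check, and holes in the table (zeroed slots, such as ROUTEOPTIMIZATION in the IPv4 map) fail that check naturally. In miniature:

    #include <stdio.h>

    enum { MODE_BEET, MODE_ROUTEOPT, MODE_TRANSPORT, MODE_TUNNEL, MODE_MAX };

    struct mode { int encap; int family; };

    static const struct mode mode4_map[MODE_MAX] = {
        [MODE_BEET]      = { MODE_BEET,      4 },
        [MODE_TRANSPORT] = { MODE_TRANSPORT, 4 },
        [MODE_TUNNEL]    = { MODE_TUNNEL,    4 },
        /* no MODE_ROUTEOPT entry: slot stays zeroed */
    };

    static const struct mode *get_mode(unsigned int encap, int family)
    {
        const struct mode *m;

        if (encap >= MODE_MAX)
            return NULL;
        m = &mode4_map[encap];
        return m->family == family ? m : NULL; /* hole => family 0 => NULL */
    }

    int main(void)
    {
        printf("%p\n", (const void *)get_mode(MODE_TUNNEL, 4));   /* valid */
        printf("%p\n", (const void *)get_mode(MODE_ROUTEOPT, 4)); /* NULL */
        return 0;
    }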
@@ -434,7 +401,7 @@ EXPORT_SYMBOL(xfrm_state_free);
 
 static void ___xfrm_state_destroy(struct xfrm_state *x)
 {
-	tasklet_hrtimer_cancel(&x->mtimer);
+	hrtimer_cancel(&x->mtimer);
 	del_timer_sync(&x->rtimer);
 	kfree(x->aead);
 	kfree(x->aalg);
@@ -444,12 +411,6 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
 	kfree(x->coaddr);
 	kfree(x->replay_esn);
 	kfree(x->preplay_esn);
-	if (x->inner_mode)
-		xfrm_put_mode(x->inner_mode);
-	if (x->inner_mode_iaf)
-		xfrm_put_mode(x->inner_mode_iaf);
-	if (x->outer_mode)
-		xfrm_put_mode(x->outer_mode);
 	if (x->type_offload)
 		xfrm_put_type_offload(x->type_offload);
 	if (x->type) {
@@ -479,8 +440,8 @@ static void xfrm_state_gc_task(struct work_struct *work)
 
 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 {
-	struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
-	struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
+	struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer);
+	enum hrtimer_restart ret = HRTIMER_NORESTART;
 	time64_t now = ktime_get_real_seconds();
 	time64_t next = TIME64_MAX;
 	int warn = 0;
@@ -544,7 +505,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 		km_state_expired(x, 0, 0);
 resched:
 	if (next != TIME64_MAX) {
-		tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
+		hrtimer_forward_now(&x->mtimer, ktime_set(next, 0));
+		ret = HRTIMER_RESTART;
 	}
 
 	goto out;
@@ -561,7 +523,7 @@ expired:
 
 out:
 	spin_unlock(&x->lock);
-	return HRTIMER_NORESTART;
+	return ret;
 }
 
 static void xfrm_replay_timer_handler(struct timer_list *t);
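The handler conversion above follows the standard hrtimer idiom: instead of re-arming itself through tasklet_hrtimer_start(), the handler forwards its own expiry and returns HRTIMER_RESTART, while the _SOFT timer modes keep execution in softirq context, which is roughly what the removed tasklet indirection used to provide. A control-flow sketch with the kernel calls stubbed out:

    #include <stdio.h>

    enum hrtimer_restart { HRTIMER_NORESTART, HRTIMER_RESTART };

    #define TIME64_MAX (~0ULL >> 1)

    static enum hrtimer_restart timer_handler(unsigned long long next)
    {
        enum hrtimer_restart ret = HRTIMER_NORESTART;

        /* ... recompute the soonest lifetime deadline into 'next' ... */
        if (next != TIME64_MAX) {
            /* kernel: hrtimer_forward_now(&x->mtimer, ktime_set(next, 0)); */
            ret = HRTIMER_RESTART; /* the hrtimer core re-arms for us */
        }
        return ret;
    }

    int main(void)
    {
        printf("%d\n", timer_handler(30));         /* 1: restart */
        printf("%d\n", timer_handler(TIME64_MAX)); /* 0: no deadline */
        return 0;
    }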
@@ -580,8 +542,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		INIT_HLIST_NODE(&x->bydst);
 		INIT_HLIST_NODE(&x->bysrc);
 		INIT_HLIST_NODE(&x->byspi);
-		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
-				     CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
+		hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT);
+		x->mtimer.function = xfrm_timer_handler;
 		timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
 		x->curlft.add_time = ktime_get_real_seconds();
 		x->lft.soft_byte_limit = XFRM_INF;
@@ -590,8 +552,6 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		x->lft.hard_packet_limit = XFRM_INF;
 		x->replay_maxage = 0;
 		x->replay_maxdiff = 0;
-		x->inner_mode = NULL;
-		x->inner_mode_iaf = NULL;
 		spin_lock_init(&x->lock);
 	}
 	return x;
@@ -1047,7 +1007,9 @@ found:
 			hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
 		}
 		x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
-		tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
+		hrtimer_start(&x->mtimer,
+			      ktime_set(net->xfrm.sysctl_acq_expires, 0),
+			      HRTIMER_MODE_REL_SOFT);
 		net->xfrm.state_num++;
 		xfrm_hash_grow_check(net, x->bydst.next != NULL);
 		spin_unlock_bh(&net->xfrm.xfrm_state_lock);
@@ -1159,7 +1121,7 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 		hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
 	}
 
-	tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
+	hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT);
 	if (x->replay_maxage)
 		mod_timer(&x->rtimer, jiffies + x->replay_maxage);
 
@@ -1266,7 +1228,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 		x->mark.m = m->m;
 		x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
 		xfrm_state_hold(x);
-		tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
+		hrtimer_start(&x->mtimer,
+			      ktime_set(net->xfrm.sysctl_acq_expires, 0),
+			      HRTIMER_MODE_REL_SOFT);
 		list_add(&x->km.all, &net->xfrm.state_all);
 		hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
 		h = xfrm_src_hash(net, daddr, saddr, family);
@@ -1571,7 +1535,8 @@ out:
 		memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
 		x1->km.dying = 0;
 
-		tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
+		hrtimer_start(&x1->mtimer, ktime_set(1, 0),
+			      HRTIMER_MODE_REL_SOFT);
 		if (x1->curlft.use_time)
 			xfrm_state_check_expire(x1);
 
@@ -1610,7 +1575,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
 	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
 	    x->curlft.packets >= x->lft.hard_packet_limit) {
 		x->km.state = XFRM_STATE_EXPIRED;
-		tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL);
+		hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT);
 		return -EINVAL;
 	}
 
@@ -2066,7 +2031,7 @@ int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address
 }
 EXPORT_SYMBOL(km_report);
 
-bool km_is_alive(const struct km_event *c)
+static bool km_is_alive(const struct km_event *c)
 {
 	struct xfrm_mgr *km;
 	bool is_alive = false;
@@ -2082,7 +2047,6 @@ bool km_is_alive(const struct km_event *c)
 
 	return is_alive;
 }
-EXPORT_SYMBOL(km_is_alive);
 
 int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen)
 {
@@ -2195,6 +2159,7 @@ struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family)
 
 	return rcu_dereference(xfrm_state_afinfo[family]);
 }
+EXPORT_SYMBOL_GPL(xfrm_state_afinfo_get_rcu);
 
 struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
 {
@@ -2242,8 +2207,9 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 {
-	struct xfrm_state_afinfo *afinfo;
-	struct xfrm_mode *inner_mode;
+	const struct xfrm_state_afinfo *afinfo;
+	const struct xfrm_mode *inner_mode;
+	const struct xfrm_mode *outer_mode;
 	int family = x->props.family;
 	int err;
 
@@ -2269,25 +2235,22 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 			goto error;
 
 		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) &&
-		    family != x->sel.family) {
-			xfrm_put_mode(inner_mode);
+		    family != x->sel.family)
 			goto error;
-		}
 
-		x->inner_mode = inner_mode;
+		x->inner_mode = *inner_mode;
 	} else {
-		struct xfrm_mode *inner_mode_iaf;
+		const struct xfrm_mode *inner_mode_iaf;
 		int iafamily = AF_INET;
 
 		inner_mode = xfrm_get_mode(x->props.mode, x->props.family);
 		if (inner_mode == NULL)
 			goto error;
 
-		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) {
-			xfrm_put_mode(inner_mode);
+		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL))
 			goto error;
-		}
-		x->inner_mode = inner_mode;
+
+		x->inner_mode = *inner_mode;
 
 		if (x->props.family == AF_INET)
 			iafamily = AF_INET6;
@@ -2295,9 +2258,7 @@
 		inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily);
 		if (inner_mode_iaf) {
 			if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL)
-				x->inner_mode_iaf = inner_mode_iaf;
-			else
-				xfrm_put_mode(inner_mode_iaf);
+				x->inner_mode_iaf = *inner_mode_iaf;
 		}
 	}
 
@@ -2311,12 +2272,13 @@
 	if (err)
 		goto error;
 
-	x->outer_mode = xfrm_get_mode(x->props.mode, family);
-	if (x->outer_mode == NULL) {
+	outer_mode = xfrm_get_mode(x->props.mode, family);
+	if (!outer_mode) {
 		err = -EPROTONOSUPPORT;
 		goto error;
 	}
 
+	x->outer_mode = *outer_mode;
 	if (init_replay) {
 		err = xfrm_init_replay(x);
 		if (err)
@@ -2384,7 +2346,7 @@ void xfrm_state_fini(struct net *net)
 	flush_work(&net->xfrm.state_hash_work);
 	flush_work(&xfrm_state_gc_work);
 
-	xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true);
+	xfrm_state_flush(net, 0, false, true);
 
 	WARN_ON(!list_empty(&net->xfrm.state_all));
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a131f9ff979e..eb8d14389601 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1006,8 +1006,8 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
 		u8 proto = 0;
 		int err;
 
-		err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy,
-				  cb->extack);
+		err = nlmsg_parse_deprecated(cb->nlh, 0, attrs, XFRMA_MAX,
+					     xfrma_policy, cb->extack);
 		if (err < 0)
 			return err;
@@ -1424,7 +1424,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
 	ret = verify_policy_dir(p->dir);
 	if (ret)
 		return ret;
-	if (p->index && ((p->index & XFRM_POLICY_MAX) != p->dir))
+	if (p->index && (xfrm_policy_id2dir(p->index) != p->dir))
 		return -EINVAL;
 
 	return 0;
@@ -1513,20 +1513,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 			return -EINVAL;
 		}
 
-		switch (ut[i].id.proto) {
-		case IPPROTO_AH:
-		case IPPROTO_ESP:
-		case IPPROTO_COMP:
-#if IS_ENABLED(CONFIG_IPV6)
-		case IPPROTO_ROUTING:
-		case IPPROTO_DSTOPTS:
-#endif
-		case IPSEC_PROTO_ANY:
-			break;
-		default:
+		if (!xfrm_id_proto_valid(ut[i].id.proto))
 			return -EINVAL;
-		}
-
 	}
 
 	return 0;
@@ -2656,9 +2644,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		}
 	}
 
-	err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs,
-			  link->nla_max ? : XFRMA_MAX,
-			  link->nla_pol ? : xfrma_policy, extack);
+	err = nlmsg_parse_deprecated(nlh, xfrm_msg_min[type], attrs,
+				     link->nla_max ? : XFRMA_MAX,
+				     link->nla_pol ? : xfrma_policy, extack);
 	if (err < 0)
 		return err;
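The verify_newpolicy_info() change above is subtle: policy directions span 0..5 once per-socket policies are counted, so they occupy three bits of the policy index, while the old mask XFRM_POLICY_MAX (3) covered only two of them. A demonstration, treating the constants as assumptions mirroring the kernel headers, where xfrm_policy_id2dir() masks the low three bits:

    #include <stdio.h>

    #define XFRM_POLICY_MAX 3
    #define xfrm_policy_id2dir(index) ((index) & 7)

    int main(void)
    {
        unsigned int index = 12; /* low three bits: 4 */
        unsigned int dir = 0;    /* claimed direction */

        /* old check lets a mismatched index through */
        printf("old check: %s\n",
               (index & XFRM_POLICY_MAX) != dir ? "reject" : "accept");
        /* new check sees all three direction bits */
        printf("new check: %s\n",
               xfrm_policy_id2dir(index) != dir ? "reject" : "accept");
        return 0;
    }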