diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 16:27:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 16:27:18 -0700 |
commit | cb8e59cc87201af93dfbb6c3dccc8fcad72a09c2 (patch) | |
tree | a334db9022f89654b777bbce8c4c6632e65b9031 /net/core | |
parent | 2e63f6ce7ed2c4ff83ba30ad9ccad422289a6c63 (diff) | |
parent | 065fcfd49763ec71ae345bb5c5a74f961031e70e (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
1) Allow setting bluetooth L2CAP modes via socket option, from Luiz
Augusto von Dentz.
2) Add GSO partial support to igc, from Sasha Neftin.
3) Several cleanups and improvements to r8169 from Heiner Kallweit.
4) Add IF_OPER_TESTING link state and use it when ethtool triggers a
device self-test. From Andrew Lunn.
5) Start moving away from custom driver versions, use the globally
defined kernel version instead, from Leon Romanovsky.
6) Support GRO vis gro_cells in DSA layer, from Alexander Lobakin.
7) Allow hard IRQ deferral during NAPI, from Eric Dumazet.
8) Add sriov and vf support to hinic, from Luo bin.
9) Support Media Redundancy Protocol (MRP) in the bridging code, from
Horatiu Vultur.
10) Support netmap in the nft_nat code, from Pablo Neira Ayuso.
11) Allow UDPv6 encapsulation of ESP in the ipsec code, from Sabrina
Dubroca. Also add ipv6 support for espintcp.
12) Lots of ReST conversions of the networking documentation, from Mauro
Carvalho Chehab.
13) Support configuration of ethtool rxnfc flows in bcmgenet driver,
from Doug Berger.
14) Allow to dump cgroup id and filter by it in inet_diag code, from
Dmitry Yakunin.
15) Add infrastructure to export netlink attribute policies to
userspace, from Johannes Berg.
16) Several optimizations to sch_fq scheduler, from Eric Dumazet.
17) Fallback to the default qdisc if qdisc init fails because otherwise
a packet scheduler init failure will make a device inoperative. From
Jesper Dangaard Brouer.
18) Several RISCV bpf jit optimizations, from Luke Nelson.
19) Correct the return type of the ->ndo_start_xmit() method in several
drivers, it's netdev_tx_t but many drivers were using
'int'. From Yunjian Wang.
20) Add an ethtool interface for PHY master/slave config, from Oleksij
Rempel.
21) Add BPF iterators, from Yonghang Song.
22) Add cable test infrastructure, including ethool interfaces, from
Andrew Lunn. Marvell PHY driver is the first to support this
facility.
23) Remove zero-length arrays all over, from Gustavo A. R. Silva.
24) Calculate and maintain an explicit frame size in XDP, from Jesper
Dangaard Brouer.
25) Add CAP_BPF, from Alexei Starovoitov.
26) Support terse dumps in the packet scheduler, from Vlad Buslov.
27) Support XDP_TX bulking in dpaa2 driver, from Ioana Ciornei.
28) Add devm_register_netdev(), from Bartosz Golaszewski.
29) Minimize qdisc resets, from Cong Wang.
30) Get rid of kernel_getsockopt and kernel_setsockopt in order to
eliminate set_fs/get_fs calls. From Christoph Hellwig.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2517 commits)
selftests: net: ip_defrag: ignore EPERM
net_failover: fixed rollback in net_failover_open()
Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv"
Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv"
vmxnet3: allow rx flow hash ops only when rss is enabled
hinic: add set_channels ethtool_ops support
selftests/bpf: Add a default $(CXX) value
tools/bpf: Don't use $(COMPILE.c)
bpf, selftests: Use bpf_probe_read_kernel
s390/bpf: Use bcr 0,%0 as tail call nop filler
s390/bpf: Maintain 8-byte stack alignment
selftests/bpf: Fix verifier test
selftests/bpf: Fix sample_cnt shared between two threads
bpf, selftests: Adapt cls_redirect to call csum_level helper
bpf: Add csum_level helper for fixing up csum levels
bpf: Fix up bpf_skb_adjust_room helper's skb csum setting
sfc: add missing annotation for efx_ef10_try_update_nic_stats_vf()
crypto/chtls: IPv6 support for inline TLS
Crypto/chcr: Fixes a coccinile check error
Crypto/chcr: Fixes compilations warnings
...
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/bpf_sk_storage.c | 4 | ||||
-rw-r--r-- | net/core/dev.c | 178 | ||||
-rw-r--r-- | net/core/devlink.c | 210 | ||||
-rw-r--r-- | net/core/dst.c | 8 | ||||
-rw-r--r-- | net/core/filter.c | 464 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 173 | ||||
-rw-r--r-- | net/core/flow_offload.c | 305 | ||||
-rw-r--r-- | net/core/gen_stats.c | 2 | ||||
-rw-r--r-- | net/core/link_watch.c | 12 | ||||
-rw-r--r-- | net/core/neighbour.c | 30 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 45 | ||||
-rw-r--r-- | net/core/netpoll.c | 38 | ||||
-rw-r--r-- | net/core/pktgen.c | 2 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 11 | ||||
-rw-r--r-- | net/core/scm.c | 141 | ||||
-rw-r--r-- | net/core/skbuff.c | 12 | ||||
-rw-r--r-- | net/core/skmsg.c | 98 | ||||
-rw-r--r-- | net/core/sock.c | 219 | ||||
-rw-r--r-- | net/core/sock_map.c | 18 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 31 | ||||
-rw-r--r-- | net/core/xdp.c | 59 |
21 files changed, 1275 insertions, 785 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/dev.c b/net/core/dev.c index 2d8aceee4284..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -4549,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4572,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ @@ -5350,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -6237,7 +6319,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6249,20 +6332,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; + if (work_done) { + if (n->gro_bitmask) + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + if (timeout) + ret = false; + } if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) - timeout = n->dev->gro_flush_timeout; - /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6294,7 +6380,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6474,7 +6563,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -7796,6 +7885,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -8736,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); @@ -9148,6 +9265,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else @@ -9208,7 +9330,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9377,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9943,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +10032,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/core/devlink.c b/net/core/devlink.c index 899edcee7dab..2cafbc808b09 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3716,24 +3716,26 @@ nla_put_failure: return err; } -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) +static struct sk_buff * +devlink_nl_region_notify_build(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return ERR_PTR(-ENOMEM); - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); + if (!hdr) { + err = -EMSGSIZE; goto out_free_msg; + } err = devlink_nl_put_handle(msg, devlink); if (err) @@ -3757,15 +3759,30 @@ static void devlink_nl_region_notify(struct devlink_region *region, } genlmsg_end(msg, hdr); - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - - return; + return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); + return ERR_PTR(err); +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); + if (IS_ERR(msg)) + return; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** @@ -4069,6 +4086,8 @@ static int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; u32 snapshot_id; @@ -4080,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); - return -EINVAL; - } - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); if (!region) { @@ -4102,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -ENOSPC; } - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; - } + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - return err; + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + return err; + } + } err = region->ops->snapshot(devlink, info->extack, &data); if (err) @@ -4121,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_snapshot_create; + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) + return -EINVAL; + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + return 0; err_snapshot_create: @@ -4128,6 +4172,10 @@ err_snapshot_create: err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); + return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, @@ -4167,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct nlattr **attrs, u64 start_offset, u64 end_offset, - bool dump, u64 *new_offset) { struct devlink_snapshot *snapshot; @@ -4182,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > region->size || dump) - end_offset = region->size; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -4212,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - u64 ret_offset, start_offset, end_offset = 0; + u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - bool dump = true; void *hdr; int err; @@ -4246,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + /* return 0 if there is no further data to read */ - if (start_offset >= region->size) { + if (start_offset == end_offset) { err = 0; goto out_unlock; } @@ -4274,27 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - dump = false; - - if (start_offset == end_offset) { - err = 0; - goto nla_put_failure; - } - } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, region, attrs, start_offset, - end_offset, dump, - &ret_offset); + end_offset, &ret_offset); if (err && err != -EMSGSIZE) goto nla_put_failure; @@ -5830,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: @@ -8455,6 +8495,50 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8466,9 +8550,28 @@ static const struct devlink_trap devlink_trap_generic[] = { static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) @@ -8806,6 +8909,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); diff --git a/net/core/dst.c b/net/core/dst.c index 193af526e908..d6b6ced0d451 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/core/filter.c b/net/core/filter.c index 5cc9276f1023..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -2026,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -3124,7 +3147,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3174,7 +3198,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3202,6 +3227,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; @@ -3422,15 +3449,26 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) + return -EINVAL; + + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); return -EINVAL; + } if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; @@ -4014,16 +4052,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4033,16 +4077,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4051,6 +4091,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4058,6 +4109,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4205,38 +4281,24 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} +#define SOCKOPT_CC_REINIT (1 << 0) -static const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; + char devname[IFNAMSIZ]; + struct net *net; + int ifindex; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) + if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); @@ -4277,6 +4339,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk_dst_reset(sk); } break; + case SO_BINDTODEVICE: + ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + optlen = min_t(long, optlen, IFNAMSIZ - 1); + strncpy(devname, optval, optlen); + devname[optlen] = 0; + + ifindex = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + ret = -ENODEV; + + net = sock_net(sk); + dev = dev_get_by_name(net, devname); + if (!dev) + break; + ifindex = dev->ifindex; + dev_put(dev); + } + ret = sock_bindtoindex(sk, ifindex, false); +#endif + break; default: ret = -EINVAL; } @@ -4329,7 +4414,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4376,24 +4461,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4459,8 +4534,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -4500,30 +4638,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. - */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, true, false); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ @@ -5983,52 +6119,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -6119,6 +6210,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6163,8 +6270,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: @@ -6193,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6213,6 +6336,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: @@ -6335,9 +6460,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: @@ -6384,6 +6509,26 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -6504,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6628,7 +6775,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6640,7 +6787,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: @@ -6770,6 +6917,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -6935,6 +7083,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -6990,6 +7145,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7001,6 +7158,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; @@ -7033,6 +7192,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7063,10 +7223,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7197,6 +7353,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): @@ -7812,6 +7973,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; @@ -7882,6 +8060,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; @@ -7962,8 +8150,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7998,9 +8186,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): @@ -8531,6 +8721,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; @@ -8786,6 +8982,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5dceed467f64..d02df0b6d0d9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_labels.h> #endif - -static DEFINE_MUTEX(flow_dissector_mutex); +#include <linux/bpf-netns.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,54 +69,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; - struct bpf_prog *attached; - struct net *net; - - if (attr->query.query_flags) - return -EINVAL; - - net = get_net_ns_by_fd(attr->query.target_fd); - if (IS_ERR(net)) - return PTR_ERR(net); - - rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); - - put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; -} - -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; - struct net *net; - int ret = 0; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -130,70 +86,29 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(ns->bpf.progs[type])) + return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(init_net.bpf.progs[type])) + return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (attached == prog) { + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (attached == prog) /* The same program cannot be attached twice */ - ret = -EINVAL; - goto out; - } - rcu_assign_pointer(net->flow_dissector_prog, prog); + return -EINVAL; + + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); -out: - mutex_unlock(&flow_dissector_mutex); - return ret; -} - -static int flow_dissector_bpf_prog_detach(struct net *net) -{ - struct bpf_prog *attached; - - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); - return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); - bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); -} - -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) -{ - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. - */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); -} - -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, -}; +#endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -480,47 +395,59 @@ EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, void *data, int nhoff, int hlen, + int lse_index, bool *entropy_label) { - struct flow_dissector_key_keyid *key_keyid; - struct mpls_label *hdr, _hdr[2]; - u32 entry, label; + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; - entry = ntohl(hdr[0].entry); + entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); - key_mpls->mpls_label = label; - key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) - >> MPLS_LS_TTL_SHIFT; - key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) - >> MPLS_LS_TC_SHIFT; - key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) - >> MPLS_LS_S_SHIFT; + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); } - if (label == MPLS_LABEL_ENTROPY) { + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); - key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + key_keyid->keyid = cpu_to_be32(label); } - return FLOW_DISSECT_RET_OUT_GOOD; + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret @@ -979,6 +906,8 @@ bool __skb_flow_dissect(const struct net *net, struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -1030,11 +959,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1278,7 +1209,10 @@ proto_again: case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, - nhoff, hlen); + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { @@ -1852,7 +1786,6 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - - return register_pernet_subsys(&flow_dissector_pernet_ops); + return 0; } core_initcall(init_default_flow_dissectors); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e951b743bed3..0cfc35e6be28 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -8,6 +8,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; + int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); @@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) return NULL; rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } @@ -311,240 +317,159 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); -static LIST_HEAD(block_cb_list); +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); -static struct rhashtable indr_setup_block_ht; - -struct flow_indr_block_cb { - struct list_head list; - void *cb_priv; - flow_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -struct flow_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params flow_indr_setup_block_ht_params = { - .key_offset = offsetof(struct flow_indr_block_dev, dev), - .head_offset = offsetof(struct flow_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; + struct rcu_head rcu; }; -static struct flow_indr_block_dev * -flow_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - flow_indr_setup_block_ht_params); -} - -static struct flow_indr_block_dev * -flow_indr_block_dev_get(struct net_device *dev) +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) { - struct flow_indr_block_dev *indr_dev; + struct flow_indr_dev *indr_dev; - indr_dev = flow_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); + indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); if (!indr_dev) return NULL; - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); -inc_ref: - indr_dev->refcnt++; return indr_dev; } -static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident) +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { - struct flow_indr_block_cb *indr_block_cb; + struct flow_indr_dev *indr_dev; - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); + list_add(&indr_dev->list, &flow_block_indr_dev_list); + mutex_unlock(&flow_indr_block_lock); - return indr_block_cb; + return 0; } +EXPORT_SYMBOL(flow_indr_dev_register); -static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) +static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, + struct list_head *cleanup_list) { - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - -static DEFINE_MUTEX(flow_indr_block_cb_lock); + struct flow_block_cb *this, *next; -static void flow_block_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command) -{ - struct flow_indr_block_entry *entry; - - mutex_lock(&flow_indr_block_cb_lock); - list_for_each_entry(entry, &block_cb_list, list) { - entry->cb(dev, cb, cb_priv, command); + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->cb == setup_cb && + this->cb_priv == cb_priv) { + list_move(&this->indr.list, cleanup_list); + return; + } } - mutex_unlock(&flow_indr_block_cb_lock); } -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) +static void flow_block_indr_notify(struct list_head *cleanup_list) { - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - int err; - - indr_dev = flow_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; - - indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); - - return 0; + struct flow_block_cb *this, *next; -err_dev_put: - flow_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - int err; - - rtnl_lock(); - err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); - - return err; + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } } -EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb) { - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); return; + } - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); + __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); - flow_indr_block_cb_del(indr_block_cb); - flow_indr_block_dev_put(indr_dev); + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); } -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); +EXPORT_SYMBOL(flow_indr_dev_unregister); -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { - rtnl_lock(); - __flow_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.dev = dev; + flow_block->indr.cleanup = cleanup; } -EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type) +static void __flow_block_indr_binding(struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; + struct flow_block_cb *block_cb; - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); + list_for_each_entry(block_cb, &bo->cb_list, list) { + switch (bo->command) { + case FLOW_BLOCK_BIND: + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + break; + case FLOW_BLOCK_UNBIND: + list_del(&block_cb->indr.list); + break; + } + } } -EXPORT_SYMBOL_GPL(flow_indr_block_call); -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) { - mutex_lock(&flow_indr_block_cb_lock); - list_add_tail(&entry->list, &block_cb_list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); + struct flow_indr_dev *this; -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_del(&entry->list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(this, &flow_block_indr_dev_list, list) + this->cb(dev, this->cb_priv, type, bo); -static int __init init_flow_indr_rhashtable(void) -{ - return rhashtable_init(&indr_setup_block_ht, - &flow_indr_setup_block_ht_params); + __flow_block_indr_binding(bo, dev, data, cleanup); + mutex_unlock(&flow_indr_block_lock); + + return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; } -subsys_initcall(init_flow_indr_rhashtable); +EXPORT_SYMBOL(flow_indr_dev_setup_offload); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include <linux/types.h> diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dbe0c6ead773..ef6b5a8f629c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3379,7 +3381,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3445,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3459,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3469,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3480,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3490,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3500,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3510,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4773ad6ec111..e353b822bb15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev, } static DEVICE_ATTR_RO(duplex); +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,7 +272,7 @@ static const char *const operstates[] = { "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; @@ -355,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - dev->gro_flush_timeout = val; + WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } @@ -370,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + WRITE_ONCE(dev->napi_defer_hard_irqs, val); + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -524,6 +553,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, + &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, @@ -532,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, @@ -1774,12 +1805,12 @@ static struct class net_class __ro_after_init = { #ifdef CONFIG_OF_NET static int of_dev_node_match(struct device *dev, const void *data) { - int ret = 0; - - if (dev->parent) - ret = dev->parent->of_node == data; + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } - return ret == 0 ? dev->of_node == data : ret; + return 0; } /* diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..093e90e52bc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -304,20 +305,22 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); - npinfo = rcu_dereference_bh(np->dev->npinfo); + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return; + return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ @@ -356,8 +359,25 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + return NETDEV_TX_OK; +} + +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + netdev_tx_t ret; + + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } + return ret; } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); +EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 08e2811b5274..b53b6d38c4df 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -56,7 +56,7 @@ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) @@ -3990,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) diff --git a/net/core/scm.c b/net/core/scm.c index dc6fed1f221c..875df1c2989d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } @@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + struct cmsghdr cmhdr; + + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || + copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) + return -EFAULT; + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; } EXPORT_SYMBOL(put_cmsg); @@ -277,78 +280,90 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); +static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) +{ + struct socket *sock; + int new_fd; + int error; + + error = security_file_receive(file); + if (error) + return error; + + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } + + /* Bump the usage count and install the file. */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } + fd_install(new_fd, get_file(file)); + return 0; +} + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = (__force struct cmsghdr __user*)msg->msg_control; - - int fdmax = 0; - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; + int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); + int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; - if (MSG_CMSG_COMPAT & msg->msg_flags) { + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - if (msg->msg_controllen > sizeof(struct cmsghdr)) - fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) - / sizeof(int)); - - if (fdnum < fdmax) - fdmax = fdnum; + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax; - i++, cmfptr++) - { - struct socket *sock; - int new_fd; - err = security_file_receive(fp[i]); + for (i = 0; i < fdmax; i++) { + err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags - ? O_CLOEXEC : 0); - if (err < 0) - break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - sock = sock_from_file(fp[i], &err); - if (sock) { - sock_update_netprioidx(&sock->sk->sk_cgrp_data); - sock_update_classid(&sock->sk->sk_cgrp_data); - } - fd_install(new_fd, get_file(fp[i])); } - if (i > 0) - { - int cmlen = CMSG_LEN(i*sizeof(int)); + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { - cmlen = CMSG_SPACE(i*sizeof(int)); + cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } } - if (i < fdnum || (fdnum && fdmax <= 0)) + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7e29590482ce..b8afefe6f6b6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); @@ -3727,7 +3727,6 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive_list); /** * skb_segment - Perform protocol segmentation on skb. @@ -4191,7 +4190,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 @@ -6087,13 +6085,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. */ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6188,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c479372f2cd2..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include <net/sock.h> #include <net/tcp.h> +#include <net/tls.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -682,13 +683,75 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; bool ingress; + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) { + kfree_skb(skb); + return; + } + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + kfree_skb(skb); + return; + } + + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + } else { + kfree_skb(skb); + } +} + +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sock *sk_other; + switch (verdict) { case __SK_PASS: sk_other = psock->sk; @@ -707,25 +770,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ + sk_psock_skb_redirect(psock, skb); + break; case __SK_DROP: /* fall-through */ default: @@ -779,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/core/sock.c b/net/core/sock.c index b714162213ae..6c4acf1f0220 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,20 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) +{ + int ret; + + if (lock_sk) + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + if (lock_sk) + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index, true); out: #endif @@ -712,6 +723,111 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -808,30 +924,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -843,9 +936,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) @@ -903,28 +995,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ @@ -1152,27 +1233,35 @@ set_rcvbuf: break; case SO_TXTIME: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - } else if (optlen != sizeof(struct sock_txtime)) { + if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; + break; } else if (copy_from_user(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; + break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; - } else { - sock_valbool_flag(sk, SOCK_TXTIME, true); - sk->sk_clockid = sk_txtime.clockid; - sk->sk_txtime_deadline_mode = - !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); - sk->sk_txtime_report_errors = - !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: @@ -3625,3 +3714,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b08dfae10f88..00a26cf2cfe9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return __sock_map_lookup_elem(map, *(u32 *)key); + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) @@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) static void *sock_hash_lookup(struct bpf_map *map, void *key) { - return __sock_hash_lookup_elem(map, key); + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void sock_hash_release_progs(struct bpf_map *map) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..b109cc8a6dd8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include <net/pkt_sched.h> static int two __maybe_unused = 2; +static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -39,13 +40,14 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +117,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +181,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +192,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +214,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +233,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -251,7 +248,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +261,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +288,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +299,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -560,7 +555,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "high_order_alloc_disable", diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,11 +11,13 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/bug.h> #include <net/page_pool.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> +#include <net/xdp_sock_drv.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -108,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -142,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -300,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -357,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -382,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! */ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -492,7 +464,14 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); |