author     David S. Miller <davem@davemloft.net>    2021-06-17 11:54:56 -0700
committer  David S. Miller <davem@davemloft.net>    2021-06-17 11:54:56 -0700
commit     a52171ae7b803f4587b8172d1768313b4d093d0a (patch)
tree       b7504137cddb40533c047a6effd024bb0ba2434f /include
parent     4de772511fd13aa5e7b9bf485ce26f87e6de2bb8 (diff)
parent     f20792d425d2efd2680f2855c1e3fec01c2e569e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
pull-request: bpf-next 2021-06-17
The following pull-request contains BPF updates for your *net-next* tree.
We've added 50 non-merge commits during the last 25 day(s) which contain
a total of 148 files changed, 4779 insertions(+), 1248 deletions(-).
The main changes are:
1) Add BPF infrastructure to migrate TCP child sockets from a listener to
   another in the same reuseport group/map, from Kuniyuki Iwashima.
2) Add a provably sound, faster and more precise algorithm for tnum_mul() as
   described in https://arxiv.org/abs/2105.05398, from Harishankar Vishwanathan.
3) Streamline error reporting in libbpf as planned out in the
   'libbpf: the road to v1.0' effort, from Andrii Nakryiko.
4) Add broadcast support to xdp_redirect_map(), from Hangbin Liu; a usage
   sketch follows the sign-off below.
5) Extend bpf_map_lookup_and_delete_elem() functionality to 4 more map
   types, that is, {LRU_,PERCPU_,LRU_PERCPU_,}HASH, from Denis Salopek; a
   user-space sketch follows the diffstat below.
6) Support new LLVM relocations in libbpf to make them more linker-friendly,
   and add a doc describing the BPF backend relocations, from Yonghong Song.
7) Silence long-standing KUBSAN complaints on register-based shifts in the
   interpreter, from Daniel Borkmann and Eric Biggers.
8) Add dummy PT_REGS macros in libbpf to fail BPF program compilation when
   the target arch cannot be determined, from Lorenz Bauer.
9) Extend AF_XDP to support large umems with 1M+ pages, from Magnus Karlsson.
10) Fix two minor libbpf tc BPF API issues, from Kumar Kartikeya Dwivedi.
11) Move the libbpf BPF_SEQ_PRINTF/BPF_SNPRINTF macros that can be used by BPF
    programs to the bpf_helpers.h header, from Florent Revest.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
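As an illustration of item 4), here is a minimal XDP sketch of how the new
BPF_F_BROADCAST / BPF_F_EXCLUDE_INGRESS flags are meant to be used. This is
only a sketch under assumptions: the devmap name "tx_ports", its sizing and
how it is populated are made up for the example and are not part of this
series.

/* Sketch: clone the received frame to every interface in the devmap except
 * the one it arrived on.  "tx_ports" is a hypothetical map that user space
 * fills with egress ifindexes.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u32));
        __uint(max_entries, 32);
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_broadcast(struct xdp_md *ctx)
{
        /* With BPF_F_BROADCAST the key (0 here) is ignored and the frame is
         * redirected to all map entries; BPF_F_EXCLUDE_INGRESS additionally
         * skips the receiving interface.
         */
        return bpf_redirect_map(&tx_ports, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";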
Diffstat (limited to 'include')
-rw-r--r--  include/linux/bpf.h                | 23
-rw-r--r--  include/linux/bpf_local_storage.h  |  4
-rw-r--r--  include/linux/filter.h             | 21
-rw-r--r--  include/net/netns/ipv4.h           |  1
-rw-r--r--  include/net/sock_reuseport.h       |  9
-rw-r--r--  include/net/xdp.h                  |  1
-rw-r--r--  include/trace/events/xdp.h         |  6
-rw-r--r--  include/uapi/linux/bpf.h           | 43
8 files changed, 97 insertions, 11 deletions
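Before the diff itself, a small user-space sketch of the extended
bpf_map_lookup_and_delete_elem() from item 5). It assumes an already created
BPF_MAP_TYPE_HASH map fd plus matching key/value buffers, and only shows the
call that this series enables for hash maps.

/* Sketch: atomically consume one entry from a hash map via the extended
 * BPF_MAP_LOOKUP_AND_DELETE_ELEM command, using the libbpf wrapper.
 */
#include <bpf/bpf.h>
#include <stdio.h>

static int pop_entry(int map_fd, const void *key, void *value)
{
        /* Copies the value stored for *key out of the map and deletes the
         * entry in one syscall; returns a negative value and sets errno
         * (e.g. ENOENT) when the element does not exist.
         */
        int err = bpf_map_lookup_and_delete_elem(map_fd, key, value);

        if (err)
                perror("bpf_map_lookup_and_delete_elem");
        return err;
}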
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dc44ba97584..f309fc1509f2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -70,6 +70,8 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
 	int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
 				union bpf_attr __user *uattr);
+	int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
+					  void *value, u64 flags);
 	int (*map_lookup_and_delete_batch)(struct bpf_map *map,
 					   const union bpf_attr *attr,
 					   union bpf_attr __user *uattr);
@@ -1499,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress);
 bool dev_map_can_have_prog(struct bpf_map *map);

 void __cpu_map_flush(void);
@@ -1668,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return 0;
 }

+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	return 0;
+}
+
 struct sk_buff;

 static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -1677,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
 	return 0;
 }

+static inline
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+			   struct bpf_prog *xdp_prog, struct bpf_map *map,
+			   bool exclude_ingress)
+{
+	return 0;
+}
+
 static inline void __cpu_map_flush(void)
 {
 }
@@ -2026,6 +2048,7 @@ struct sk_reuseport_kern {
 	struct sk_buff *skb;
 	struct sock *sk;
 	struct sock *selected_sk;
+	struct sock *migrating_sk;
 	void *data_end;
 	u32 hash;
 	u32 reuseport_id;
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index b902c580c48d..24496bc28e7b 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -58,7 +58,7 @@ struct bpf_local_storage_data {
 	 * from the object's bpf_local_storage.
 	 *
 	 * Put it in the same cacheline as the data to minimize
-	 * the number of cachelines access during the cache hit case.
+	 * the number of cachelines accessed during the cache hit case.
 	 */
 	struct bpf_local_storage_map __rcu *smap;
 	u8 data[] __aligned(8);
@@ -71,7 +71,7 @@ struct bpf_local_storage_elem {
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
 	/* 8 bytes hole */
-	/* The data is stored in aother cacheline to minimize
+	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
 	 */
 	struct bpf_local_storage_data sdata ____cacheline_aligned;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9a09547bc7ba..688856e0b28a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -646,6 +646,7 @@ struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
 	void *tgt_value;
+	struct bpf_map *map;
 	u32 map_id;
 	enum bpf_map_type map_type;
 	u32 kern_flags;
@@ -995,11 +996,13 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
+				  struct sock *migrating_sk,
 				  u32 hash);
 #else
 static inline struct sock *
 bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 		     struct bpf_prog *prog, struct sk_buff *skb,
+		     struct sock *migrating_sk,
 		     u32 hash)
 {
 	return NULL;
@@ -1464,17 +1467,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */

-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
+						  u64 flags, const u64 flag_mask,
 						  void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;

 	/* Lower bits of the flags are used as return code on lookup failure */
-	if (unlikely(flags > XDP_TX))
+	if (unlikely(flags & ~(action_mask | flag_mask)))
 		return XDP_ABORTED;

 	ri->tgt_value = lookup_elem(map, ifindex);
-	if (unlikely(!ri->tgt_value)) {
+	if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
 		/* If the lookup fails we want to clear out the state in the
 		 * redirect_info struct completely, so that if an eBPF program
 		 * performs multiple lookups, the last one always takes
@@ -1482,13 +1487,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
 		 */
 		ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
 		ri->map_type = BPF_MAP_TYPE_UNSPEC;
-		return flags;
+		return flags & action_mask;
 	}

 	ri->tgt_index = ifindex;
 	ri->map_id = map->id;
 	ri->map_type = map->map_type;

+	if (flags & BPF_F_BROADCAST) {
+		WRITE_ONCE(ri->map, map);
+		ri->flags = flags;
+	} else {
+		WRITE_ONCE(ri->map, NULL);
+		ri->flags = 0;
+	}
+
 	return XDP_REDIRECT;
 }

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 746c80cd4257..b8620519eace 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -126,6 +126,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_syn_retries;
 	u8 sysctl_tcp_synack_retries;
 	u8 sysctl_tcp_syncookies;
+	u8 sysctl_tcp_migrate_req;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 505f1e18e9bf..473b0b0fa4ab 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
 struct sock_reuseport {
 	struct rcu_head		rcu;

-	u16			max_socks;	/* length of socks */
-	u16			num_socks;	/* elements in socks */
+	u16			max_socks;		/* length of socks */
+	u16			num_socks;		/* elements in socks */
+	u16			num_closed_socks;	/* closed elements in socks */
 	/* The last synq overflow event timestamp of this
 	 * reuse->socks[] group.
 	 */
@@ -31,10 +32,14 @@
 extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
 			      bool bind_inany);
 extern void reuseport_detach_sock(struct sock *sk);
+void reuseport_stop_listen_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
 					  u32 hash,
 					  struct sk_buff *skb,
 					  int hdr_len);
+struct sock *reuseport_migrate_sock(struct sock *sk,
+				    struct sock *migrating_sk,
+				    struct sk_buff *skb);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a5bc214a49d9..5533f0ab2afc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 					 struct net_device *dev);
 int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);

 static inline
 void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index fcad3645a70b..c40fc97f9417 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
 		u32 ifindex = 0, map_index = index;

 		if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-			ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+			/* Just leave to_ifindex to 0 if do broadcast redirect,
+			 * as tgt will be NULL.
+			 */
+			if (tgt)
+				ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
 		} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
 			ifindex = index;
 			map_index = 0;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 418b9b813d65..bf9252c7381e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -527,6 +527,15 @@ union bpf_iter_link_info {
 *		Look up an element with the given *key* in the map referred to
 *		by the file descriptor *fd*, and if found, delete the element.
 *
+*		For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+*		types, the *flags* argument needs to be set to 0, but for other
+*		map types, it may be specified as:
+*
+*		**BPF_F_LOCK**
+*			Look up and delete the value of a spin-locked map
+*			without returning the lock. This must be specified if
+*			the elements contain a spinlock.
+*
 *		The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
 *		implement this command as a "pop" operation, deleting the top
 *		element rather than one corresponding to *key*.
@@ -536,6 +545,10 @@ union bpf_iter_link_info {
 *		This command is only valid for the following map types:
 *		  * **BPF_MAP_TYPE_QUEUE**
 *		  * **BPF_MAP_TYPE_STACK**
+*		  * **BPF_MAP_TYPE_HASH**
+*		  * **BPF_MAP_TYPE_PERCPU_HASH**
+*		  * **BPF_MAP_TYPE_LRU_HASH**
+*		  * **BPF_MAP_TYPE_LRU_PERCPU_HASH**
 *
 *	Return
 *		Returns zero on success. On error, -1 is returned and *errno*
@@ -981,6 +994,8 @@ enum bpf_attach_type {
 	BPF_SK_LOOKUP,
 	BPF_XDP,
 	BPF_SK_SKB_VERDICT,
+	BPF_SK_REUSEPORT_SELECT,
+	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	__MAX_BPF_ATTACH_TYPE
 };

@@ -2542,8 +2557,12 @@ union bpf_attr {
 *		The lower two bits of *flags* are used as the return code if
 *		the map lookup fails. This is so that the return value can be
 *		one of the XDP program return codes up to **XDP_TX**, as chosen
-*		by the caller. Any higher bits in the *flags* argument must be
-*		unset.
+*		by the caller. The higher bits of *flags* can be set to
+*		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+*
+*		With BPF_F_BROADCAST the packet will be broadcasted to all the
+*		interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+*		interface will be excluded when do broadcasting.
 *
 *		See also **bpf_redirect**\ (), which only supports redirecting
 *		to an ifindex, but doesn't require a map to do so.
@@ -5109,6 +5128,12 @@ enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };

+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
@@ -5393,6 +5418,20 @@ struct sk_reuseport_md {
 	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
 	__u32 bind_inany;	/* Is sock bound to an INANY address? */
 	__u32 hash;		/* A hash of the packet 4 tuples */
+	/* When reuse->migrating_sk is NULL, it is selecting a sk for the
+	 * new incoming connection request (e.g. selecting a listen sk for
+	 * the received SYN in the TCP case). reuse->sk is one of the sk
+	 * in the reuseport group. The bpf prog can use reuse->sk to learn
+	 * the local listening ip/port without looking into the skb.
+	 *
+	 * When reuse->migrating_sk is not NULL, reuse->sk is closed and
+	 * reuse->migrating_sk is the socket that needs to be migrated
+	 * to another listening socket. migrating_sk could be a fullsock
+	 * sk that is fully established or a reqsk that is in-the-middle
+	 * of 3-way handshake.
+	 */
+	__bpf_md_ptr(struct bpf_sock *, sk);
+	__bpf_md_ptr(struct bpf_sock *, migrating_sk);
 };

 #define BPF_TAG_SIZE 8
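To close, a sketch of how the new BPF_SK_REUSEPORT_SELECT_OR_MIGRATE attach
type and the sk_reuseport_md->migrating_sk field documented above could be
used. The map name "migrate_map", its size and the pick-index-0 policy are
assumptions made for illustration, and the SEC("sk_reuseport/migrate")
section name is assumed to be resolved by libbpf to the new attach type.

/* Sketch: one program handles both normal SYN selection and migration of
 * sockets away from a closed listener.  "migrate_map" is a hypothetical
 * sockarray that user space keeps filled with the live listeners.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, 16);
} migrate_map SEC(".maps");

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *md)
{
        __u32 index = 0;

        /* md->migrating_sk is NULL for plain listener selection and non-NULL
         * when a fullsock or reqsk has to be moved off a closed listener; in
         * both cases hand the connection to the listener stored at index 0.
         */
        if (bpf_sk_select_reuseport(md, &migrate_map, &index, 0))
                return SK_DROP;
        return SK_PASS;
}

char _license[] SEC("license") = "GPL";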