diff options
author | Jens Axboe <axboe@kernel.dk> | 2024-05-11 08:25:55 -0600 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2024-05-11 08:25:55 -0600 |
commit | fe6532b44af402d0900c5be3e5359f4b293524b1 (patch) | |
tree | 6cd3f9e365f14126b69979b85df2cf458f4bb23b /net | |
parent | ad1978dbbd827c1a1a7d22d9cc9ba71989dae48a (diff) | |
parent | cddd2dc6390b90e62cec2768424d1d90f6d04161 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next into net-accept-more
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1557 commits)
net: qede: use extack in qede_parse_actions()
net: qede: propagate extack through qede_flow_spec_validate()
net: qede: use faked extack in qede_flow_spec_to_rule()
net: qede: use extack in qede_parse_flow_attr()
net: qede: add extack in qede_add_tc_flower_fltr()
net: qede: use extack in qede_flow_parse_udp_v4()
net: qede: use extack in qede_flow_parse_udp_v6()
net: qede: use extack in qede_flow_parse_tcp_v4()
net: qede: use extack in qede_flow_parse_tcp_v6()
net: qede: use extack in qede_flow_parse_v4_common()
net: qede: use extack in qede_flow_parse_v6_common()
net: qede: use extack in qede_set_v4_tuple_to_profile()
net: qede: use extack in qede_set_v6_tuple_to_profile()
net: qede: use extack in qede_flow_parse_ports()
net: usb: smsc95xx: stop lying about skb->truesize
net: dsa: microchip: Fix spellig mistake "configur" -> "configure"
af_unix: Add dead flag to struct scm_fp_list.
net: ethernet: adi: adin1110: Replace linux/gpio.h by proper one
octeontx2-pf: Reuse Transmit queue/Send queue index of HTB class
gve: Use ethtool_sprintf/puts() to fill stats strings
...
Diffstat (limited to 'net')
289 files changed, 6403 insertions, 2819 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 39876eff51d2..3efba4f857ac 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -149,7 +149,7 @@ static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) if (max_mtu < new_mtu) return -ERANGE; - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index a3b68243fd4b..cf5219df7903 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -117,17 +117,15 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { - nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { - if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) - continue; + nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, + data[IFLA_VLAN_INGRESS_QOS], rem) { m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { - nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { - if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) - continue; + nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, + data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) diff --git a/net/Kconfig b/net/Kconfig index 3e57ccf0da27..f0a8692496ff 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -60,6 +60,9 @@ config NET_XGRESS config NET_REDIRECT bool +config SKB_DECRYPTED + bool + config SKB_EXTENSIONS bool @@ -449,6 +452,9 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_IEEE8021Q_HELPERS + bool + config NET_SELFTESTS def_tristate PHYLIB depends on PHYLIB && INET diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c index d945b7c0176d..7aebfe903242 100644 --- a/net/appletalk/sysctl_net_atalk.c +++ b/net/appletalk/sysctl_net_atalk.c @@ -40,7 +40,6 @@ static struct ctl_table atalk_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, }; static struct ctl_table_header *atalk_table_header; diff --git a/net/atm/clip.c b/net/atm/clip.c index 294cb9efe3d3..42b910cb4e8e 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -345,7 +345,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, dev->stats.tx_dropped++; return NETDEV_TX_OK; } - rt = (struct rtable *) dst; + rt = dst_rtable(dst); if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else @@ -463,7 +463,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip) unlink_clip_vcc(clip_vcc); return 0; } - rt = ip_route_output(&init_net, ip, 0, 1, 0); + rt = ip_route_output(&init_net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return PTR_ERR(rt); neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); diff --git a/net/atm/common.c b/net/atm/common.c index 2a1ec014e901..9b75699992ff 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index fdb666607f10..e23a3dc14b93 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -4,7 +4,7 @@ # menuconfig HAMRADIO - depends on NET && !S390 + depends on NET bool "Amateur Radio support" help If you want to connect your Linux box to an amateur radio, answer Y diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 282ec581c072..0bc682ffae9c 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -78,7 +78,10 @@ void ax25_dev_device_up(struct net_device *dev) ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; + +#ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; +#endif #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index db66e11e7fe8..68753aa30334 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -141,8 +141,6 @@ static const struct ctl_table ax25_param_table[] = { .extra2 = &max_ds_timeout }, #endif - - { } /* that's all, folks! */ }; int ax25_register_dev_sysctl(ax25_dev *ax25_dev) @@ -155,6 +153,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) if (!table) return -ENOMEM; + BUILD_BUG_ON(ARRAY_SIZE(ax25_param_table) != AX25_MAX_VALUES); for (k = 0; k < AX25_MAX_VALUES; k++) table[k].data = &ax25_dev->values[k]; @@ -171,7 +170,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) { struct ctl_table_header *header = ax25_dev->sysheader; - struct ctl_table *table; + const struct ctl_table *table; if (header) { ax25_dev->sysheader = NULL; diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 75119f1ffccc..8e0f44c71696 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -14,7 +14,6 @@ #include <linux/crc32c.h> #include <linux/device.h> #include <linux/errno.h> -#include <linux/genetlink.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> @@ -38,6 +37,7 @@ #include <linux/string.h> #include <linux/workqueue.h> #include <net/dsfield.h> +#include <net/genetlink.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8ca854a75a32..3d4c36ae2e1a 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2024.1" +#define BATADV_SOURCE_VERSION "2024.2" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0954757f0b8b..9362cd9d6f3d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -15,7 +15,6 @@ #include <linux/cache.h> #include <linux/err.h> #include <linux/errno.h> -#include <linux/genetlink.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 71c143d4b6d0..ac74f6ead62d 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1266,6 +1266,8 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) /* for all origins... */ for (i = 0; i < hash->size; i++) { head = &hash->table[i]; + if (hlist_empty(head)) + continue; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 89c51b3cf430..30ecbc2ef1fd 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -159,7 +159,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); bat_priv->mtu_set_by_user = new_mtu; return 0; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 2243cec18ecc..b21ff3c36b07 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -209,20 +209,6 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, } /** - * batadv_tt_local_entry_free_rcu() - free the tt_local_entry - * @rcu: rcu pointer of the tt_local_entry - */ -static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_local_entry *tt_local_entry; - - tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, - common.rcu); - - kmem_cache_free(batadv_tl_cache, tt_local_entry); -} - -/** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node @@ -236,7 +222,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) batadv_softif_vlan_put(tt_local_entry->vlan); - call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); + kfree_rcu(tt_local_entry, common.rcu); } /** @@ -255,20 +241,6 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) } /** - * batadv_tt_global_entry_free_rcu() - free the tt_global_entry - * @rcu: rcu pointer of the tt_global_entry - */ -static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_global_entry *tt_global_entry; - - tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, - common.rcu); - - kmem_cache_free(batadv_tg_cache, tt_global_entry); -} - -/** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the nc_node @@ -282,7 +254,7 @@ void batadv_tt_global_entry_release(struct kref *ref) batadv_tt_global_del_orig_list(tt_global_entry); - call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); + kfree_rcu(tt_global_entry, common.rcu); } /** @@ -408,19 +380,6 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, } /** - * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry - * @rcu: rcu pointer of the orig_entry - */ -static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_orig_list_entry *orig_entry; - - orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - - kmem_cache_free(batadv_tt_orig_cache, orig_entry); -} - -/** * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry @@ -433,7 +392,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) refcount); batadv_orig_node_put(orig_entry->orig_node); - call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); + kfree_rcu(orig_entry, rcu); } /** diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 27520a8a486f..50cfec8ccac4 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -133,7 +133,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int count = atomic_read(&dev->peer_count); const struct in6_addr *nexthop; struct lowpan_peer *peer; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index de33dc1b0daa..891cdf61c65a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -79,6 +79,51 @@ static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) args->args[3], args->args[4]); } +static const struct bpf_ctx_arg_aux *find_ctx_arg_info(struct bpf_prog_aux *aux, int offset) +{ + int i; + + for (i = 0; i < aux->ctx_arg_info_size; i++) + if (aux->ctx_arg_info[i].offset == offset) + return &aux->ctx_arg_info[i]; + + return NULL; +} + +/* There is only one check at the moment: + * - zero should not be passed for pointer parameters not marked as nullable. + */ +static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_args *args) +{ + const struct btf_type *func_proto = prog->aux->attach_func_proto; + + for (u32 arg_no = 0; arg_no < btf_type_vlen(func_proto) ; ++arg_no) { + const struct btf_param *param = &btf_params(func_proto)[arg_no]; + const struct bpf_ctx_arg_aux *info; + const struct btf_type *t; + int offset; + + if (args->args[arg_no] != 0) + continue; + + /* Program is validated already, so there is no need + * to check if t is NULL. + */ + t = btf_type_skip_modifiers(bpf_dummy_ops_btf, param->type, NULL); + if (!btf_type_is_ptr(t)) + continue; + + offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no); + info = find_ctx_arg_info(prog->aux, offset); + if (info && (info->reg_type & PTR_MAYBE_NULL)) + continue; + + return -EINVAL; + } + + return 0; +} + extern const struct bpf_link_ops bpf_struct_ops_link_lops; int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, @@ -87,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_links *tlinks = NULL; struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; @@ -109,6 +154,10 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (IS_ERR(args)) return PTR_ERR(args); + err = check_test_run_args(prog, args); + if (err) + goto out; + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) { err = -ENOMEM; @@ -133,7 +182,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err < 0) goto out; - arch_protect_bpf_trampoline(image, PAGE_SIZE); + err = arch_protect_bpf_trampoline(image, PAGE_SIZE); + if (err) + goto out; prog_ret = dummy_ops_call_op(image, args); err = dummy_ops_copy_args(args); @@ -230,7 +281,7 @@ static void bpf_dummy_unreg(void *kdata) { } -static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb) +static int bpf_dummy_ops__test_1(struct bpf_dummy_ops_state *cb__nullable) { return 0; } @@ -247,7 +298,7 @@ static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb) } static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { - .test_1 = bpf_dummy_test_1, + .test_1 = bpf_dummy_ops__test_1, .test_2 = bpf_dummy_test_2, .test_sleepable = bpf_dummy_test_sleepable, }; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 61efeadaff8d..f6aad4ed2ab2 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -575,6 +575,13 @@ __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, return a + *b + c + d + (long)e + f + g; } +__bpf_kfunc int bpf_modify_return_test_tp(int nonce) +{ + trace_bpf_trigger_tp(nonce); + + return nonce; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -622,6 +629,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) +BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c366ccc8b3db..71b50001ca58 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -197,7 +197,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); /* this flag will be cleared if the MTU was automatically adjusted */ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true); @@ -389,7 +389,7 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx, br_vlan_fill_forward_path_pvid(br, ctx, path); f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id); - if (!f || !f->dst) + if (!f) return -1; dst = READ_ONCE(f->dst); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 22e35623c148..bf30c50b5689 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -399,7 +399,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ goto free_skb; rt = ip_route_output(net, iph->daddr, 0, - RT_TOS(iph->tos), 0); + RT_TOS(iph->tos), 0, + RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ @@ -1225,7 +1226,6 @@ static struct ctl_table brnf_table[] = { .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, - { } }; static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) @@ -1274,7 +1274,7 @@ static int br_netfilter_sysctl_init_net(struct net *net) static void br_netfilter_sysctl_exit_net(struct net *net, struct brnf_net *brnet) { - struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; + const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(brnet->ctl_hdr); if (!net_eq(net, &init_net)) diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 81833ca7a2c7..a966a6ec8263 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -65,13 +65,14 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, { struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err; if (metadata) return -EEXIST; - metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, - key, 0); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, key, 0); if (!metadata) return -EINVAL; @@ -185,6 +186,7 @@ void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tunnel_dst; __be64 tunnel_id; int err; @@ -202,7 +204,8 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, return err; if (BR_INPUT_SKB_CB(skb)->backup_nhid) { - tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, tunnel_id, 0); if (!tunnel_dst) return -ENOMEM; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 8480684f2762..20139fa1be1f 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -201,14 +201,14 @@ int cfctrl_linkup_request(struct cflayer *layer, struct cflayer *user_layer) { struct cfctrl *cfctrl = container_obj(layer); + struct cflayer *dn = cfctrl->serv.layer.dn; + char utility_name[UTILITY_NAME_LENGTH]; + struct cfctrl_request_info *req; + struct cfpkt *pkt; u32 tmp32; u16 tmp16; u8 tmp8; - struct cfctrl_request_info *req; int ret; - char utility_name[16]; - struct cfpkt *pkt; - struct cflayer *dn = cfctrl->serv.layer.dn; if (!dn) { pr_debug("not able to send linkup request\n"); diff --git a/net/core/Makefile b/net/core/Makefile index 6e6548011fae..62be9aef2528 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o @@ -41,4 +42,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o -obj-$(CONFIG_NET_TEST) += gso_test.o +obj-$(CONFIG_NET_TEST) += net_test.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6c4d90b24d46..bc01b3aa6b0f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) if (!bpf_capable()) return ERR_PTR(-EPERM); - nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { - if (nla_len(nla) != sizeof(u32)) - return ERR_PTR(-EINVAL); - nr_maps++; - } + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); - nla_for_each_nested(nla, nla_stgs, rem) { - struct bpf_map *map; - int map_fd; - - if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - continue; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); - map_fd = nla_get_u32(nla); - map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; diff --git a/net/core/datagram.c b/net/core/datagram.c index a8b625abe242..e614cfd8e14a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) -{ - bool slow; - - if (!skb_unref(skb)) { - sk_peek_offset_bwd(sk, len); - return; - } - - slow = lock_sock_fast(sk); - sk_peek_offset_bwd(sk, len); - skb_orphan(skb); - unlock_sock_fast(sk, slow); - - /* skb is now orphaned, can be freed outside of locked section */ - __kfree_skb(skb); -} -EXPORT_SYMBOL(__skb_free_datagram_locked); - int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, diff --git a/net/core/dev.c b/net/core/dev.c index 331848eca7d3..d2ce91a334c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -77,7 +77,9 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/isolation.h> #include <linux/sched/mm.h> +#include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> @@ -156,6 +158,7 @@ #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/rps.h> +#include <linux/phy_link_topology_core.h> #include "dev.h" #include "net-sysfs.h" @@ -197,35 +200,60 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock_irqsave(struct softnet_data *sd, - unsigned long *flags) +#ifndef CONFIG_PREEMPT_RT + +static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); + +static int __init setup_backlog_napi_threads(char *arg) +{ + static_branch_enable(&use_backlog_threads_key); + return 0; +} +early_param("thread_backlog_napi", setup_backlog_napi_threads); + +static bool use_backlog_threads(void) { - if (IS_ENABLED(CONFIG_RPS)) + return static_branch_unlikely(&use_backlog_threads_key); +} + +#else + +static bool use_backlog_threads(void) +{ + return true; +} + +#endif + +static inline void backlog_lock_irq_save(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_save(*flags); } -static inline void rps_lock_irq_disable(struct softnet_data *sd) +static inline void backlog_lock_irq_disable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); } -static inline void rps_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) +static inline void backlog_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_restore(*flags); } -static inline void rps_unlock_irq_enable(struct softnet_data *sd) +static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { - if (IS_ENABLED(CONFIG_RPS)) + if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); @@ -912,6 +940,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) } EXPORT_SYMBOL(dev_get_by_napi_id); +static DEFINE_SEQLOCK(netdev_rename_lock); + +void netdev_copy_name(struct net_device *dev, char *name) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&netdev_rename_lock); + strscpy(name, dev->name, IFNAMSIZ); + } while (read_seqretry(&netdev_rename_lock, seq)); +} + /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace @@ -923,7 +963,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex) struct net_device *dev; int ret; - down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); @@ -932,12 +971,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex) goto out; } - strcpy(name, dev->name); + netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); - up_read(&devnet_rename_sem); return ret; } @@ -1189,7 +1227,10 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); + write_seqlock(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); + write_sequnlock(&netdev_rename_lock); + if (err < 0) { up_write(&devnet_rename_sem); return err; @@ -1229,7 +1270,9 @@ rollback: if (err >= 0) { err = ret; down_write(&devnet_rename_sem); + write_seqlock(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; @@ -2057,6 +2100,11 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +EXPORT_SYMBOL(tcf_bypass_check_needed_key); +#endif + DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL @@ -3917,6 +3965,11 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; + if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { + if (tcf_block_bypass_sw(miniq->block)) + return ret; + } + tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); @@ -4410,8 +4463,8 @@ EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ +static DEFINE_PER_CPU(struct task_struct *, backlog_napi); -unsigned int sysctl_skb_defer_max __read_mostly = 64; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -4433,18 +4486,16 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ thread = READ_ONCE(napi->thread); if (thread) { - /* Avoid doing set_bit() if the thread is in - * INTERRUPTIBLE state, cause napi_thread_wait() - * makes sure to proceed with napi polling - * if the thread is explicitly woken from here. - */ - if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE) - set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); + if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) + goto use_local_napi; + + set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } +use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() @@ -4470,7 +4521,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; - u32 flow_id; + u32 flow_id, head; u16 rxq_index; int rc; @@ -4493,16 +4544,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; - rflow->filter = rc; - if (old_rflow->filter == rflow->filter) - old_rflow->filter = RPS_NO_FILTER; + WRITE_ONCE(rflow->filter, rc); + if (old_rflow->filter == rc) + WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif - rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); + rps_input_queue_tail_save(&rflow->last_qtail, head); } - rflow->cpu = next_cpu; + WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } @@ -4581,7 +4632,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4635,9 +4686,9 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu < nr_cpu_ids && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < + if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && + ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - + READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } @@ -4684,6 +4735,11 @@ static void napi_schedule_rps(struct softnet_data *sd) #ifdef CONFIG_RPS if (sd != mysd) { + if (use_backlog_threads()) { + __napi_schedule_irqoff(&sd->backlog); + return; + } + sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4698,6 +4754,23 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +{ + unsigned long flags; + + if (use_backlog_threads()) { + backlog_lock_irq_save(sd, &flags); + + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + __napi_schedule_irqoff(&sd->backlog); + + backlog_unlock_irq_restore(sd, &flags); + + } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { + smp_call_function_single_async(cpu, &sd->defer_csd); + } +} + #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif @@ -4749,37 +4822,45 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; + u32 tail; + + reason = SKB_DROP_REASON_DEV_READY; + if (!netif_running(skb->dev)) + goto bad_dev; - reason = SKB_DROP_REASON_NOT_SPECIFIED; + reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); - rps_lock_irqsave(sd, &flags); - if (!netif_running(skb->dev)) - goto drop; + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog)) + goto cpu_backlog_drop; + backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(net_hotdata.max_backlog) && - !skb_flow_limit(skb, qlen)) { - if (qlen) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - rps_unlock_irq_restore(sd, &flags); - return NET_RX_SUCCESS; + if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } + __skb_queue_tail(&sd->input_pkt_queue, skb); + tail = rps_input_queue_tail_incr(sd); + backlog_unlock_irq_restore(sd, &flags); - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) - napi_schedule_rps(sd); - goto enqueue; + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); + return NET_RX_SUCCESS; } - reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: - sd->dropped++; - rps_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + atomic_inc(&sd->dropped); +bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; @@ -5844,21 +5925,21 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } local_bh_enable(); @@ -5870,14 +5951,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); return do_flush; #endif @@ -5943,7 +6024,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; - if (remsd) { + if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); @@ -5958,7 +6039,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS - return sd->rps_ipi_list != NULL; + return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif @@ -5986,13 +6067,14 @@ static int process_backlog(struct napi_struct *napi, int quota) rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); - if (++work >= quota) + if (++work >= quota) { + rps_input_queue_head_add(sd, work); return work; + } } - rps_lock_irq_disable(sd); + backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -6002,15 +6084,17 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; again = false; } else { skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock_irq_enable(sd); + backlog_unlock_irq_enable(sd); } + if (work) + rps_input_queue_head_add(sd, work); return work; } @@ -6447,7 +6531,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } } - dev->threaded = threaded; + WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. @@ -6538,7 +6622,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = 0; + dev->threaded = false; netif_napi_set_irq(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight); @@ -6716,8 +6800,6 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static int napi_thread_wait(struct napi_struct *napi) { - bool woken = false; - set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { @@ -6726,15 +6808,13 @@ static int napi_thread_wait(struct napi_struct *napi) * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ - if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) { + if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); - /* woken being true indicates this thread owns this napi. */ - woken = true; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -6742,43 +6822,48 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static int napi_threaded_poll(void *data) +static void napi_threaded_poll_loop(struct napi_struct *napi) { - struct napi_struct *napi = data; struct softnet_data *sd; - void *have; + unsigned long last_qs = jiffies; - while (!napi_thread_wait(napi)) { - unsigned long last_qs = jiffies; - - for (;;) { - bool repoll = false; + for (;;) { + bool repoll = false; + void *have; - local_bh_disable(); - sd = this_cpu_ptr(&softnet_data); - sd->in_napi_threaded_poll = true; + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + sd->in_napi_threaded_poll = true; - have = netpoll_poll_lock(napi); - __napi_poll(napi, &repoll); - netpoll_poll_unlock(have); + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); - sd->in_napi_threaded_poll = false; - barrier(); + sd->in_napi_threaded_poll = false; + barrier(); - if (sd_has_rps_ipi_waiting(sd)) { - local_irq_disable(); - net_rps_action_and_irq_enable(sd); - } - skb_defer_free_flush(sd); - local_bh_enable(); + if (sd_has_rps_ipi_waiting(sd)) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } + skb_defer_free_flush(sd); + local_bh_enable(); - if (!repoll) - break; + if (!repoll) + break; - rcu_softirq_qs_periodic(last_qs); - cond_resched(); - } + rcu_softirq_qs_periodic(last_qs); + cond_resched(); } +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + + while (!napi_thread_wait(napi)) + napi_threaded_poll_loop(napi); + return 0; } @@ -8459,27 +8544,29 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; + unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); - dev->flags |= IFF_PROMISC; - dev->promiscuity += inc; - if (dev->promiscuity == 0) { + promiscuity = dev->promiscuity + inc; + if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ - if (inc < 0) - dev->flags &= ~IFF_PROMISC; - else { - dev->promiscuity -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_PROMISC; + } else { + flags = old_flags | IFF_PROMISC; } - if (dev->flags != old_flags) { + WRITE_ONCE(dev->promiscuity, promiscuity); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { @@ -8530,25 +8617,27 @@ EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; + unsigned int allmulti, flags; ASSERT_RTNL(); - dev->flags |= IFF_ALLMULTI; - dev->allmulti += inc; - if (dev->allmulti == 0) { + allmulti = dev->allmulti + inc; + if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ - if (inc < 0) - dev->flags &= ~IFF_ALLMULTI; - else { - dev->allmulti -= inc; + if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } + flags = old_flags & ~IFF_ALLMULTI; + } else { + flags = old_flags | IFF_ALLMULTI; } - if (dev->flags ^ old_flags) { + WRITE_ONCE(dev->allmulti, allmulti); + if (flags != old_flags) { + WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); @@ -8874,7 +8963,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) return -ERANGE; if (new_len != orig_len) { - dev->tx_queue_len = new_len; + WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) @@ -8888,7 +8977,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); - dev->tx_queue_len = orig_len; + WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } @@ -9134,7 +9223,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); - dev->proto_down = proto_down; + WRITE_ONCE(dev->proto_down, proto_down); return 0; } @@ -9148,18 +9237,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { + u32 proto_down_reason; int b; if (!mask) { - dev->proto_down_reason = value; + proto_down_reason = value; } else { + proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) - dev->proto_down_reason |= BIT(b); + proto_down_reason |= BIT(b); else - dev->proto_down_reason &= ~BIT(b); + proto_down_reason &= ~BIT(b); } } + WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { @@ -10349,25 +10441,12 @@ err_free_name: } EXPORT_SYMBOL(register_netdevice); -/** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initialize the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. +/* Initialize the core of a dummy net device. + * This is useful if you are calling this function after alloc_netdev(), + * since it does not memset the net_device fields. */ -void init_dummy_netdev(struct net_device *dev) +static void init_dummy_netdev_core(struct net_device *dev) { - /* Clear everything. Note we don't initialize spinlocks - * are they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - /* make sure we BUG if trying to hit standard * register/unregister code path */ @@ -10388,8 +10467,28 @@ void init_dummy_netdev(struct net_device *dev) * its refcount. */ } -EXPORT_SYMBOL_GPL(init_dummy_netdev); +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initializes the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +void init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * as they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + init_dummy_netdev_core(dev); +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); /** * register_netdev - register a network device @@ -10488,8 +10587,9 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) rebroadcast_time = jiffies; } + rcu_barrier(); + if (!wait) { - rcu_barrier(); wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); @@ -10898,6 +10998,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + dev->link_topo = phy_link_topo_create(dev); + if (IS_ERR(dev->link_topo)) { + dev->link_topo = NULL; + goto free_all; + } + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -10986,8 +11092,11 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; + phy_link_topo_destroy(dev->link_topo); + /* Compatibility with error handling in drivers */ - if (dev->reg_state == NETREG_UNINITIALIZED) { + if (dev->reg_state == NETREG_UNINITIALIZED || + dev->reg_state == NETREG_DUMMY) { netdev_freemem(dev); return; } @@ -11001,6 +11110,19 @@ void free_netdev(struct net_device *dev) EXPORT_SYMBOL(free_netdev); /** + * alloc_netdev_dummy - Allocate and initialize a dummy net device. + * @sizeof_priv: size of private data to allocate space for + * + * Return: the allocated net_device on success, NULL otherwise + */ +struct net_device *alloc_netdev_dummy(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, + init_dummy_netdev_core); +} +EXPORT_SYMBOL_GPL(alloc_netdev_dummy); + +/** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. @@ -11303,8 +11425,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_net_set(dev, net); dev->ifindex = new_ifindex; - if (new_name[0]) /* Rename the netdev to prepared name */ + if (new_name[0]) { + /* Rename the netdev to prepared name */ + write_seqlock(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); + write_sequnlock(&netdev_rename_lock); + } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); @@ -11379,7 +11505,7 @@ static int dev_cpu_dead(unsigned int oldcpu) list_del_init(&napi->poll_list); if (napi->poll == process_backlog) - napi->state = 0; + napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } @@ -11387,21 +11513,23 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); + if (!use_backlog_threads()) { #ifdef CONFIG_RPS - remsd = oldsd->rps_ipi_list; - oldsd->rps_ipi_list = NULL; + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; #endif - /* send out pending IPI's on offline CPU */ - net_rps_send_ipi(remsd); + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; @@ -11718,7 +11846,7 @@ static int net_page_pool_create(int cpuid) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, - .nid = NUMA_NO_NODE, + .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; @@ -11731,6 +11859,38 @@ static int net_page_pool_create(int cpuid) return 0; } +static int backlog_napi_should_run(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); +} + +static void run_backlog_napi(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + + napi_threaded_poll_loop(&sd->backlog); +} + +static void backlog_napi_setup(unsigned int cpu) +{ + struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); + struct napi_struct *napi = &sd->backlog; + + napi->thread = this_cpu_read(backlog_napi); + set_bit(NAPI_STATE_THREADED, &napi->state); +} + +static struct smp_hotplug_thread backlog_threads = { + .store = &backlog_napi, + .thread_should_run = backlog_napi_should_run, + .thread_fn = run_backlog_napi, + .thread_comm = "backlog_napi/%u", + .setup = backlog_napi_setup, +}; + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. @@ -11782,10 +11942,13 @@ static int __init net_dev_init(void) init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } + if (use_backlog_threads()) + smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; @@ -11811,6 +11974,10 @@ static int __init net_dev_init(void) NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; + + /* avoid static key IPIs to isolated CPUs */ + if (housekeeping_enabled(HK_TYPE_MISC)) + net_enable_timestamp(); out: if (rc < 0) { for_each_possible_cpu(i) { diff --git a/net/core/dev.h b/net/core/dev.h index 2bcaf8eee50c..b7b518bc2be5 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -4,11 +4,9 @@ #include <linux/types.h> #include <linux/rwsem.h> +#include <linux/netdevice.h> struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; @@ -38,7 +36,6 @@ int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ -extern unsigned int sysctl_skb_defer_max; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; @@ -150,4 +147,23 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + +#define XMIT_RECURSION_LIMIT 8 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + #endif diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c index 4dbd0dc6aea2..8e1dba825e94 100644 --- a/net/core/dev_addr_lists_test.c +++ b/net/core/dev_addr_lists_test.c @@ -49,7 +49,6 @@ static int dev_addr_test_init(struct kunit *test) KUNIT_FAIL(test, "Can't register netdev %d", err); } - rtnl_lock(); return 0; } @@ -57,7 +56,6 @@ static void dev_addr_test_exit(struct kunit *test) { struct net_device *netdev = test->priv; - rtnl_unlock(); unregister_netdev(netdev); free_netdev(netdev); } @@ -67,6 +65,7 @@ static void dev_addr_test_basic(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); memset(addr, 2, sizeof(addr)); @@ -76,6 +75,7 @@ static void dev_addr_test_basic(struct kunit *test) memset(addr, 3, sizeof(addr)); dev_addr_set(netdev, addr); KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr)); + rtnl_unlock(); } static void dev_addr_test_sync_one(struct kunit *test) @@ -86,6 +86,7 @@ static void dev_addr_test_sync_one(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -103,6 +104,7 @@ static void dev_addr_test_sync_one(struct kunit *test) * considered synced and we overwrite in place. */ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_del(struct kunit *test) @@ -114,6 +116,7 @@ static void dev_addr_test_add_del(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); for (i = 1; i < 4; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, @@ -143,6 +146,7 @@ static void dev_addr_test_add_del(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_del_main(struct kunit *test) @@ -150,6 +154,7 @@ static void dev_addr_test_del_main(struct kunit *test) struct net_device *netdev = test->priv; u8 addr[ETH_ALEN]; + rtnl_lock(); memset(addr, 1, sizeof(addr)); eth_hw_addr_set(netdev, addr); @@ -161,6 +166,7 @@ static void dev_addr_test_del_main(struct kunit *test) NETDEV_HW_ADDR_T_LAN)); KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, NETDEV_HW_ADDR_T_LAN)); + rtnl_unlock(); } static void dev_addr_test_add_set(struct kunit *test) @@ -172,6 +178,7 @@ static void dev_addr_test_add_set(struct kunit *test) datp = netdev_priv(netdev); + rtnl_lock(); /* There is no external API like dev_addr_add_excl(), * so shuffle the tree a little bit and exploit aliasing. */ @@ -191,6 +198,7 @@ static void dev_addr_test_add_set(struct kunit *test) __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, dev_addr_test_unsync); KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); + rtnl_unlock(); } static void dev_addr_test_add_excl(struct kunit *test) @@ -199,6 +207,7 @@ static void dev_addr_test_add_excl(struct kunit *test) u8 addr[ETH_ALEN]; int i; + rtnl_lock(); for (i = 0; i < 10; i++) { memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); @@ -213,6 +222,7 @@ static void dev_addr_test_add_excl(struct kunit *test) memset(addr, i, sizeof(addr)); KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); } + rtnl_unlock(); } static struct kunit_case dev_addr_test_cases[] = { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b0f221d658be..430ed18f8584 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -74,7 +74,7 @@ struct net_dm_hw_entries { }; struct per_cpu_dm_data { - spinlock_t lock; /* Protects 'skb', 'hw_entries' and + raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { @@ -168,9 +168,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: - spin_lock_irqsave(&data->lock, flags); + raw_spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; @@ -225,7 +225,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); - spin_lock(&data->lock); + raw_spin_lock(&data->lock); dskb = data->skb; if (!dskb) @@ -259,7 +259,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location) } out: - spin_unlock_irqrestore(&data->lock, flags); + raw_spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, @@ -314,9 +314,9 @@ net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } @@ -448,7 +448,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); - spin_lock_irqsave(&hw_data->lock, flags); + raw_spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) @@ -477,7 +477,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, } out: - spin_unlock_irqrestore(&hw_data->lock, flags); + raw_spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { @@ -1673,7 +1673,7 @@ static struct notifier_block dropmon_net_notifier = { static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { - spin_lock_init(&data->lock); + raw_spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 0ccfd5fa5cb9..6a0482e676d3 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -47,7 +47,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, /* the cache already hold a dst reference; it can't go away */ dst_hold(dst); - if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || + if (unlikely(!time_after(idst->refresh_ts, + READ_ONCE(dst_cache->reset_ts)) || (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); @@ -83,7 +84,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) return NULL; *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); + return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -111,8 +112,8 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, return; idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, - rt6_get_cookie((struct rt6_info *)dst)); + dst_cache_per_cpu_dst_set(idst, dst, + rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -170,7 +171,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache) if (!dst_cache->cache) return; - dst_cache->reset_ts = jiffies; + dst_cache_reset(dst_cache); for_each_possible_cpu(i) { struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); struct dst_entry *dst = idst->dst; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3f933ffcefc3..6ebffbc63236 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1142,10 +1142,10 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; - int idx = 0, family; + int err, idx = 0, family; if (cb->strict_check) { - int err = fib_valid_dumprule_req(nlh, cb->extack); + err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; @@ -1158,17 +1158,17 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - dump_rules(skb, cb, ops); - - return skb->len; + return dump_rules(skb, cb, ops); } + err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; - if (dump_rules(skb, cb, ops) < 0) + err = dump_rules(skb, cb, ops); + if (err < 0) break; cb->args[1] = 0; @@ -1178,7 +1178,7 @@ skip: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static void notify_rule_change(int event, struct fib_rule *rule, @@ -1293,7 +1293,8 @@ static int __init fib_rules_init(void) int err; rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, + RTNL_FLAG_DUMP_UNLOCKED); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index ae5254f712c9..2510464692af 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -87,6 +87,9 @@ #include "dev.h" +/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ +static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2215,7 +2218,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { dst = skb_dst(skb); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; @@ -2314,8 +2317,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, rcu_read_lock(); if (!nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); + struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { @@ -4684,7 +4686,7 @@ set_compat: to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) - to->tunnel_flags = info->key.tun_flags; + to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; @@ -4727,7 +4729,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) int err; if (unlikely(!info || - !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } @@ -4797,15 +4799,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; - if (flags & BPF_F_DONT_FRAGMENT) - info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; - if (flags & BPF_F_SEQ_NUMBER) - info->key.tun_flags |= TUNNEL_SEQ; - if (flags & BPF_F_NO_TUNNEL_KEY) - info->key.tun_flags &= ~TUNNEL_KEY; + __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); + __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, + flags & BPF_F_DONT_FRAGMENT); + __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, + !(flags & BPF_F_ZERO_CSUM_TX)); + __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, + flags & BPF_F_SEQ_NUMBER); + __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, + !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -4843,13 +4845,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); + IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); + ip_tunnel_set_options_present(present); + ip_tunnel_info_opts_set(info, from, size, present); return 0; } @@ -5906,7 +5910,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { - fl4.flowi4_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl4.flowi4_mark = params->mark; + else + fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); @@ -6049,7 +6056,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { - fl6.flowi6_mark = 0; + if (flags & BPF_FIB_LOOKUP_MARK) + fl6.flowi6_mark = params->mark; + else + fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); @@ -6127,7 +6137,7 @@ set_fwd_params: #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ - BPF_FIB_LOOKUP_SRC) + BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) @@ -8364,8 +8374,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 272f09251343..f82e9a7d3b37 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -455,17 +455,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); - if (info->options_len) { - enc_opt->len = info->options_len; - ip_tunnel_info_opts_get(enc_opt->data, info); - enc_opt->dst_opt_type = info->key.tun_flags & - TUNNEL_OPTIONS_PRESENT; - } + if (!info->options_len) + return; + + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + + ip_tunnel_set_options_present(flags); + ip_tunnel_flags_and(flags, info->key.tun_flags, flags); + + val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, + IP_TUNNEL_GENEVE_OPT_BIT); + enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); diff --git a/net/core/gro.c b/net/core/gro.c index c7901253a1a8..e2f84947fb74 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -3,6 +3,7 @@ #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> +#include <linux/skbuff_ref.h> #define MAX_GRO_SKBS 8 @@ -230,6 +231,33 @@ done: return 0; } +int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + skb->sk = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} + static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { diff --git a/net/core/hotdata.c b/net/core/hotdata.c index c8a7a451c18a..d0aaaaa556f2 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <net/hotdata.h> #include <linux/cache.h> #include <linux/jiffies.h> #include <linux/list.h> - +#include <net/hotdata.h> +#include <net/proto_memory.h> struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), @@ -18,5 +18,8 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .max_backlog = 1000, .dev_tx_weight = 64, .dev_rx_weight = 64, + .sysctl_max_skb_frags = MAX_SKB_FRAGS, + .sysctl_skb_defer_max = 64, + .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c new file mode 100644 index 000000000000..759a9b9f3f89 --- /dev/null +++ b/net/core/ieee8021q_helpers.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> + +#include <linux/array_size.h> +#include <linux/printk.h> +#include <linux/types.h> +#include <net/dscp.h> +#include <net/ieee8021q.h> + +/* The following arrays map Traffic Types (TT) to traffic classes (TC) for + * different number of queues as shown in the example provided by + * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and + * Table I-1 "Traffic type to traffic class mapping". + */ +static const u8 ieee8021q_8queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, + [IEEE8021Q_TT_VO] = 5, + [IEEE8021Q_TT_IC] = 6, + [IEEE8021Q_TT_NC] = 7, +}; + +static const u8 ieee8021q_7queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, + [IEEE8021Q_TT_CA] = 3, + [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4, + [IEEE8021Q_TT_IC] = 5, + [IEEE8021Q_TT_NC] = 6, +}; + +static const u8 ieee8021q_6queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, + [IEEE8021Q_TT_BE] = 1, + [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2, + [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3, + [IEEE8021Q_TT_IC] = 4, + [IEEE8021Q_TT_NC] = 5, +}; + +static const u8 ieee8021q_5queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, + [IEEE8021Q_TT_NC] = 4, +}; + +static const u8 ieee8021q_4queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1, + [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2, + [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3, +}; + +static const u8 ieee8021q_3queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2, +}; + +static const u8 ieee8021q_2queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1, + [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1, +}; + +static const u8 ieee8021q_1queue_tt_tc_map[] = { + [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0, + [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0, + [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0, + [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0, +}; + +/** + * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class + * @tt: IEEE 802.1Q Traffic Type + * @num_queues: Number of queues + * + * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based + * on the number of queues configured on the NIC. The mapping is based on the + * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic + * class mapping" and Table I-1 "Traffic type to traffic class mapping". + * + * Return: Traffic Class corresponding to the given Traffic Type or negative + * value in case of error. + */ +int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) +{ + if (tt < 0 || tt >= IEEE8021Q_TT_MAX) { + pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt, + IEEE8021Q_TT_MAX); + return -EINVAL; + } + + switch (num_queues) { + case 8: + compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_8queue_tt_tc_map != max - 1"); + return ieee8021q_8queue_tt_tc_map[tt]; + case 7: + compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_7queue_tt_tc_map != max - 1"); + + return ieee8021q_7queue_tt_tc_map[tt]; + case 6: + compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_6queue_tt_tc_map != max - 1"); + + return ieee8021q_6queue_tt_tc_map[tt]; + case 5: + compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_5queue_tt_tc_map != max - 1"); + + return ieee8021q_5queue_tt_tc_map[tt]; + case 4: + compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_4queue_tt_tc_map != max - 1"); + + return ieee8021q_4queue_tt_tc_map[tt]; + case 3: + compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_3queue_tt_tc_map != max - 1"); + + return ieee8021q_3queue_tt_tc_map[tt]; + case 2: + compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_2queue_tt_tc_map != max - 1"); + + return ieee8021q_2queue_tt_tc_map[tt]; + case 1: + compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != + IEEE8021Q_TT_MAX - 1, + "ieee8021q_1queue_tt_tc_map != max - 1"); + + return ieee8021q_1queue_tt_tc_map[tt]; + } + + pr_err("Invalid number of queues %d\n", num_queues); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc); + +/** + * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type + * @dscp: IETF DSCP value + * + * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT). + * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic + * Type, this function is inspired by the RFC8325 documentation which describe + * the mapping between DSCP and 802.11 User Priority (UP) values. + * + * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value + */ +int ietf_dscp_to_ieee8021q_tt(u8 dscp) +{ + switch (dscp) { + case DSCP_CS0: + /* Comment from RFC8325: + * [RFC4594], Section 4.8, recommends High-Throughput Data be marked + * AF1x (that is, AF11, AF12, and AF13, according to the rules defined + * in [RFC2475]). + * + * By default (as described in Section 2.3), High-Throughput Data will + * map to UP 1 and, thus, to the Background Access Category (AC_BK), + * which is contrary to the intent expressed in [RFC4594]. + + * Unfortunately, there really is no corresponding fit for the High- + * Throughput Data service class within the constrained 4 Access + * Category [IEEE.802.11-2016] model. If the High-Throughput Data + * service class is assigned to the Best Effort Access Category (AC_BE), + * then it would contend with Low-Latency Data (while [RFC4594] + * recommends a distinction in servicing between these service classes) + * as well as with the default service class; alternatively, if it is + * assigned to the Background Access Category (AC_BK), then it would + * receive a less-then-best-effort service and contend with Low-Priority + * Data (as discussed in Section 4.2.10). + * + * As such, since there is no directly corresponding fit for the High- + * Throughout Data service class within the [IEEE.802.11-2016] model, it + * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby + * admitting it to the Best Effort Access Category (AC_BE). + * + * Note: The above text is from RFC8325 which is describing the mapping + * between DSCP and 802.11 User Priority (UP) values. The mapping + * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but + * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q + * Traffic Types BE and BK. + */ + case DSCP_AF11: + case DSCP_AF12: + case DSCP_AF13: + return IEEE8021Q_TT_BE; + /* Comment from RFC8325: + * RFC3662 and RFC4594 both recommend Low-Priority Data be marked + * with DSCP CS1. The Low-Priority Data service class loosely + * corresponds to the [IEEE.802.11-2016] Background Access Category + */ + case DSCP_CS1: + return IEEE8021Q_TT_BK; + case DSCP_CS2: + case DSCP_AF21: + case DSCP_AF22: + case DSCP_AF23: + return IEEE8021Q_TT_EE; + case DSCP_CS3: + case DSCP_AF31: + case DSCP_AF32: + case DSCP_AF33: + return IEEE8021Q_TT_CA; + case DSCP_CS4: + case DSCP_AF41: + case DSCP_AF42: + case DSCP_AF43: + return IEEE8021Q_TT_VI; + case DSCP_CS5: + case DSCP_EF: + case DSCP_VOICE_ADMIT: + return IEEE8021Q_TT_VO; + case DSCP_CS6: + return IEEE8021Q_TT_IC; + case DSCP_CS7: + return IEEE8021Q_TT_NC; + } + + return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp); +} +EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 552719c3bbc3..45fd88405b6b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -734,7 +734,9 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); + bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK); + + return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -1769,7 +1771,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; +static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { @@ -1826,13 +1828,19 @@ void neigh_table_init(int index, struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; - neigh_tables[index] = tbl; + rcu_assign_pointer(neigh_tables[index], tbl); } EXPORT_SYMBOL(neigh_table_init); +/* + * Only called from ndisc_cleanup(), which means this is dead code + * because we no longer can unload IPv6 module. + */ int neigh_table_clear(int index, struct neigh_table *tbl) { - neigh_tables[index] = NULL; + RCU_INIT_POINTER(neigh_tables[index], NULL); + synchronize_rcu(); + /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); @@ -1864,10 +1872,10 @@ static struct neigh_table *neigh_find_table(int family) switch (family) { case AF_INET: - tbl = neigh_tables[NEIGH_ARP_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]); break; case AF_INET6: - tbl = neigh_tables[NEIGH_ND_TABLE]; + tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]); break; } @@ -2331,7 +2339,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) @@ -2519,7 +2527,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = neigh_tables[tidx]; + tbl = rcu_dereference_rtnl(neigh_tables[tidx]); if (!tbl) continue; @@ -2674,7 +2682,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = dev ? netdev_master_upper_dev_get(dev) : NULL; + master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL; /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another * invalid value for ifindex to denote "no master". @@ -2707,7 +2715,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct net *net = sock_net(skb->sk); struct neighbour *n; - int rc, h, s_h = cb->args[1]; + int err = 0, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; unsigned int flags = NLM_F_MULTI; @@ -2715,7 +2723,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { @@ -2729,23 +2736,19 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - flags) < 0) { - rc = -1; + err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags); + if (err < 0) goto out; - } next: idx++; } } - rc = skb->len; out: - rcu_read_unlock(); cb->args[1] = h; cb->args[2] = idx; - return rc; + return err; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, @@ -2754,7 +2757,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); - int rc, h, s_h = cb->args[3]; + int err = 0, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; unsigned int flags = NLM_F_MULTI; @@ -2772,11 +2775,11 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || neigh_master_filtered(n->dev, filter->master_idx)) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, flags, tbl) < 0) { + err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, flags, tbl); + if (err < 0) { read_unlock_bh(&tbl->lock); - rc = -1; goto out; } next: @@ -2785,12 +2788,10 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, } read_unlock_bh(&tbl->lock); - rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; - return rc; - + return err; } static int neigh_valid_dump_req(const struct nlmsghdr *nlh, @@ -2878,8 +2879,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; + rcu_read_lock(); for (t = 0; t < NEIGH_NR_TABLES; t++) { - tbl = neigh_tables[t]; + tbl = rcu_dereference(neigh_tables[t]); if (!tbl) continue; @@ -2895,9 +2897,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } + rcu_read_unlock(); cb->args[0] = t; - return skb->len; + return err; } static int neigh_valid_get_req(const struct nlmsghdr *nlh, @@ -3143,14 +3146,15 @@ int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; + if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; - tbl = neigh_tables[index]; - if (!tbl) - goto out; rcu_read_lock(); + tbl = rcu_dereference(neigh_tables[index]); + if (!tbl) + goto out_unlock; if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3166,6 +3170,7 @@ int neigh_xmit(int index, struct net_device *dev, goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); +out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { @@ -3728,7 +3733,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3779,7 +3784,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3807,8 +3811,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; @@ -3889,7 +3891,8 @@ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, + RTNL_FLAG_DUMP_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index a97eceb84e61..fa6d3969734a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + sd->processed, atomic_read(&sd->dropped), + sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ sd->received_rps, flow_limit_count, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index e3d7a8cfa20b..4c27a360c294 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -605,13 +605,13 @@ static ssize_t threaded_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); + rcu_read_lock(); if (dev_isalive(netdev)) - ret = sysfs_emit(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); + + rcu_read_unlock(); - rtnl_unlock(); return ret; } @@ -1419,7 +1419,7 @@ static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, @@ -1451,7 +1451,7 @@ static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) { - return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); + return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } static ssize_t bql_set_stall_max(struct netdev_queue *queue, @@ -1468,7 +1468,7 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; - return sprintf(buf, "%lu\n", dql->stall_cnt); + return sysfs_emit(buf, "%lu\n", dql->stall_cnt); } static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = @@ -2046,7 +2046,7 @@ static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) net_ns_get_ownership(net, uid, gid); } -static struct class net_class __ro_after_init = { +static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9d690d32da33..4f7a61688d18 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1093,7 +1093,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); - return err < 0 ? err : skb->len; + return err; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, @@ -1208,7 +1208,8 @@ void __init net_ns_init(void) rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED); + RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) diff --git a/net/core/gso_test.c b/net/core/net_test.c index 358c44680d91..9c3a590865d2 100644 --- a/net/core/gso_test.c +++ b/net/core/net_test.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <kunit/test.h> + +/* GSO */ + #include <linux/skbuff.h> static const char hdr[] = "abcdefgh"; @@ -258,17 +261,127 @@ free_gso_skb: consume_skb(skb); } -static struct kunit_case gso_test_cases[] = { - KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), - {} +/* IP tunnel flags */ + +#include <net/ip_tunnels.h> + +struct ip_tunnel_flags_test { + const char *name; + + const u16 *src_bits; + const u16 *exp_bits; + u8 src_num; + u8 exp_num; + + __be16 exp_val; + bool exp_comp; +}; + +#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \ + .name = (n), \ + .src_bits = (src), \ + .src_num = ARRAY_SIZE(src), \ + .exp_comp = (comp), \ + .exp_val = (eval), \ + .exp_bits = (exp), \ + .exp_num = ARRAY_SIZE(exp), \ +} + +/* These are __be16-compatible and can be compared as is */ +static const u16 ip_tunnel_flags_1[] = { + IP_TUNNEL_KEY_BIT, + IP_TUNNEL_STRICT_BIT, + IP_TUNNEL_ERSPAN_OPT_BIT, +}; + +/* Due to the previous flags design limitation, setting either + * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT`` + * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they + * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is + * backward-compatible. + */ +#ifdef __LITTLE_ENDIAN +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT +#else +#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT +#endif + +static const u16 ip_tunnel_flags_2_src[] = { + IP_TUNNEL_CONFLICT_BIT, +}; + +static const u16 ip_tunnel_flags_2_exp[] = { + IP_TUNNEL_CONFLICT_BIT, + IP_TUNNEL_SIT_ISATAP_BIT, }; -static struct kunit_suite gso_test_suite = { - .name = "net_core_gso", - .test_cases = gso_test_cases, +/* Bits 17 and higher are not compatible with __be16 flags */ +static const u16 ip_tunnel_flags_3_src[] = { + IP_TUNNEL_VXLAN_OPT_BIT, + 17, + 18, + 20, }; -kunit_test_suite(gso_test_suite); +static const u16 ip_tunnel_flags_3_exp[] = { + IP_TUNNEL_VXLAN_OPT_BIT, +}; + +static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = { + IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true, + cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) | + BIT(IP_TUNNEL_STRICT_BIT) | + BIT(IP_TUNNEL_ERSPAN_OPT_BIT)), + ip_tunnel_flags_1), + IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true, + VTI_ISVTI, ip_tunnel_flags_2_exp), + IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false, + cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)), + ip_tunnel_flags_3_exp), +}; + +static void +ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} +KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test, + ip_tunnel_flags_test_case_to_desc); + +static void ip_tunnel_flags_test_run(struct kunit *test) +{ + const struct ip_tunnel_flags_test *t = test->param_value; + IP_TUNNEL_DECLARE_FLAGS(src) = { }; + IP_TUNNEL_DECLARE_FLAGS(exp) = { }; + IP_TUNNEL_DECLARE_FLAGS(out); + + for (u32 j = 0; j < t->src_num; j++) + __set_bit(t->src_bits[j], src); + for (u32 j = 0; j < t->exp_num; j++) + __set_bit(t->exp_bits[j], exp); + + KUNIT_ASSERT_EQ(test, t->exp_comp, + ip_tunnel_flags_is_be16_compat(src)); + KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val, + (__force u16)ip_tunnel_flags_to_be16(src)); + + ip_tunnel_flags_from_be16(out, t->exp_val); + KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out)); +} + +static struct kunit_case net_test_cases[] = { + KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params), + KUNIT_CASE_PARAM(ip_tunnel_flags_test_run, + ip_tunnel_flags_test_gen_params), + { }, +}; + +static struct kunit_suite net_test_suite = { + .name = "net_core", + .test_cases = net_test_cases, +}; +kunit_test_suite(net_test_suite); +MODULE_DESCRIPTION("KUnit tests for networking core"); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("KUnit tests for segmentation offload"); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8d8ace9ef87f..8350a0afa9ec 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -70,6 +70,7 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN /* NETDEV_CMD_QSTATS_GET - dump */ static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = { + [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 7004b3399c2b..4b5054087309 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -489,7 +489,17 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } @@ -498,7 +508,16 @@ static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || - netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes)) + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } @@ -639,6 +658,24 @@ nla_put_failure: return -EMSGSIZE; } +static int +netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, + struct sk_buff *skb, const struct genl_info *info, + struct netdev_nl_dump_ctx *ctx) +{ + if (!netdev->stat_ops) + return 0; + + switch (scope) { + case 0: + return netdev_nl_stats_by_netdev(netdev, skb, info); + case NETDEV_QSTATS_SCOPE_QUEUE: + return netdev_nl_stats_by_queue(netdev, skb, info, ctx); + } + + return -EINVAL; /* Should not happen, per netlink policy */ +} + int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -646,6 +683,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; + unsigned int ifindex; unsigned int scope; int err = 0; @@ -653,21 +691,28 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { - if (!netdev->stat_ops) - continue; + ifindex = 0; + if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) + ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - switch (scope) { - case 0: - err = netdev_nl_stats_by_netdev(netdev, skb, info); - break; - case NETDEV_QSTATS_SCOPE_QUEUE: - err = netdev_nl_stats_by_queue(netdev, skb, info, ctx); - break; + rtnl_lock(); + if (ifindex) { + netdev = __dev_get_by_index(net, ifindex); + if (netdev && netdev->stat_ops) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = netdev ? -EOPNOTSUPP : -ENODEV; + } + } else { + for_each_netdev_dump(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; } - if (err < 0) - break; } rtnl_unlock(); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 543007f159f9..55bcacf67df3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -316,7 +316,7 @@ static int netpoll_owner_active(struct net_device *dev) struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner == smp_processor_id()) + if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c00..8bcc7014a61a 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,6 +5,7 @@ * Copyright (C) 2016 Red Hat, Inc. */ +#include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -123,9 +124,9 @@ int page_pool_ethtool_stats_get_count(void) } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { - struct page_pool_stats *pool_stats = stats; + const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; @@ -383,8 +384,8 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } -static void page_pool_dma_sync_for_device(struct page_pool *pool, - struct page *page, +static void page_pool_dma_sync_for_device(const struct page_pool *pool, + const struct page *page, unsigned int dma_sync_size) { dma_addr_t dma_addr = page_pool_get_dma_addr(page); @@ -550,6 +551,7 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) return page; } EXPORT_SYMBOL(page_pool_alloc_pages); +ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution @@ -690,8 +692,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) + if (allow_direct && page_pool_recycle_in_cache(page, pool)) return NULL; /* Page found as candidate for recycling */ @@ -716,9 +717,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ @@ -747,8 +774,11 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { int i, bulk_len = 0; + bool allow_direct; bool in_softirq; + allow_direct = page_pool_napi_local(pool); + for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); @@ -756,13 +786,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, if (!page_pool_is_last_ref(page)) continue; - page = __page_pool_put_page(pool, page, -1, false); + page = __page_pool_put_page(pool, page, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (page) data[bulk_len++] = page; } - if (unlikely(!bulk_len)) + if (!bulk_len) return; /* Bulk producer into ptr_ring page_pool cache */ @@ -959,7 +989,7 @@ static void page_pool_release_retry(struct work_struct *wq) } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; @@ -969,7 +999,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), static void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. - * Paired with READ_ONCE() in napi_pp_put_page(). + * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8ba6a4e4be26..b86b0a87367d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1036,8 +1036,8 @@ static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); - if (dev->proto_down_reason) - size += nla_total_size(0) + nla_total_size(4); + /* Assume dev->proto_down_reason is not zero. */ + size += nla_total_size(0) + nla_total_size(4); return size; } @@ -1477,13 +1477,15 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; + u32 res = 0; - ASSERT_RTNL(); + rcu_read_lock(); + generic_xdp_prog = rcu_dereference(dev->xdp_prog); + if (generic_xdp_prog) + res = generic_xdp_prog->aux->id; + rcu_read_unlock(); - generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (!generic_xdp_prog) - return 0; - return generic_xdp_prog->aux->id; + return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) @@ -1603,7 +1605,8 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) - ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + ret = nla_put_u32(skb, IFLA_MASTER, + READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; @@ -1736,10 +1739,10 @@ static int rtnl_fill_proto_down(struct sk_buff *skb, struct nlattr *pr; u32 preason; - if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; - preason = dev->proto_down_reason; + preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; @@ -1812,6 +1815,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { + char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; @@ -1824,41 +1828,51 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; - ifm->ifi_type = dev->type; - ifm->ifi_index = dev->ifindex; + ifm->ifi_type = READ_ONCE(dev->type); + ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; - qdisc = rtnl_dereference(dev->qdisc); - if (nla_put_string(skb, IFLA_IFNAME, dev->name) || - nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || + netdev_copy_name(dev, devname); + if (nla_put_string(skb, IFLA_IFNAME, devname)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || - nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || - nla_put_u32(skb, IFLA_MTU, dev->mtu) || - nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || - nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || - nla_put_u32(skb, IFLA_GROUP, dev->group) || - nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || - nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || - nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || - nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || - nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || - nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || - nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || - nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || - nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || + netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN) || + nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || + nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || + nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || + nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || + nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || + nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, + READ_ONCE(dev->num_tx_queues)) || + nla_put_u32(skb, IFLA_GSO_MAX_SEGS, + READ_ONCE(dev->gso_max_segs)) || + nla_put_u32(skb, IFLA_GSO_MAX_SIZE, + READ_ONCE(dev->gso_max_size)) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, + READ_ONCE(dev->gro_max_size)) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, + READ_ONCE(dev->gso_ipv4_max_size)) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, + READ_ONCE(dev->gro_ipv4_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, + READ_ONCE(dev->tso_max_size)) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, + READ_ONCE(dev->tso_max_segs)) || #ifdef CONFIG_RPS - nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, + READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (qdisc && - nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -1909,9 +1923,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; } - if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) - goto nla_put_failure; - if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; @@ -1924,6 +1935,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); + if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) + goto nla_put_failure_rcu; + qdisc = rcu_dereference(dev->qdisc); + if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) + goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; if (rtnl_fill_link_ifmap(skb, dev)) @@ -5245,15 +5261,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - if (nla_len(attr) < sizeof(flags)) - return -EINVAL; + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; - have_flags = true; - flags = nla_get_u16(attr); - break; - } + have_flags = true; + flags = nla_get_u16(attr); + break; } } @@ -5962,19 +5977,17 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; - int h, s_h, err, s_idx, s_idxattr, s_prividx; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; - struct hlist_head *head; + struct { + unsigned long ifindex; + int idxattr; + int prividx; + } *ctx = (void *)cb->ctx; struct net_device *dev; - int idx = 0; - - s_h = cb->args[0]; - s_idx = cb->args[1]; - s_idxattr = cb->args[2]; - s_prividx = cb->args[3]; + int err; cb->seq = net->dev_base_seq; @@ -5993,39 +6006,26 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) if (err) return err; - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, &filters, - &s_idxattr, &s_prividx, - extack); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + for_each_netdev_dump(net, dev, ctx->ifindex) { + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, &filters, + &ctx->idxattr, &ctx->prividx, + extack); + /* If we ran out of room on the first message, + * we're in trouble. + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; - s_prividx = 0; - s_idxattr = 0; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } + if (err < 0) + break; + ctx->prividx = 0; + ctx->idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); } -out: - cb->args[3] = s_prividx; - cb->args[2] = s_idxattr; - cb->args[1] = idx; - cb->args[0] = h; - return skb->len; + return err; } void rtnl_offload_xstats_notify(struct net_device *dev) diff --git a/net/core/scm.c b/net/core/scm.c index 9cd4b0a01cd6..4f6a14babe5a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -89,6 +89,12 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -376,8 +382,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1169fdbdb2c3..466999a7515e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -51,6 +51,7 @@ #endif #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/splice.h> #include <linux/cache.h> #include <linux/rtnetlink.h> @@ -108,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #define SKB_SMALL_HEAD_HEADROOM \ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) -int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; -EXPORT_SYMBOL(sysctl_max_skb_frags); - /* kcm_write_msgs() relies on casting paged frags to bio_vec to use * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the * netmem is a page. @@ -775,10 +773,9 @@ skb_fail: EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -787,9 +784,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -860,7 +857,7 @@ skb_success: skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) @@ -1005,11 +1002,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) +bool napi_pp_put_page(struct page *page) { - bool allow_direct = false; - struct page_pool *pp; - page = compound_head(page); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation @@ -1022,39 +1016,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) if (unlikely(!is_pp_page(page))) return false; - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - * __page_pool_put_page() makes sure we're not in hardirq context - * and interrupts are enabled prior to accessing the cache. - */ - if (napi_safe || in_softirq()) { - const struct napi_struct *napi = READ_ONCE(pp->p.napi); - unsigned int cpuid = smp_processor_id(); - - allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; - allow_direct |= READ_ONCE(pp->cpuid) == cpuid; - } - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); + page_pool_put_full_page(page->pp, page, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return napi_pp_put_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data)); } /** @@ -1096,12 +1069,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset) kfree(head); } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head, napi_safe)) + if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { @@ -1109,8 +1082,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -1127,13 +1099,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb, napi_safe); + skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -1194,12 +1166,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason, napi_safe); + skb_release_data(skb, reason); } /** @@ -1213,7 +1184,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1270,7 +1241,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason, false); + skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1331,22 +1302,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) has_trans = skb_transport_header_was_set(skb); printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" - "mac=(%d,%d) net=(%d,%d) trans=%d\n" + "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n" "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" - "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" - "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", + "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n" + "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n" + "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n" + "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n", level, skb->len, headroom, skb_headlen(skb), tailroom, has_mac ? skb->mac_header : -1, has_mac ? skb_mac_header_len(skb) : -1, + skb->mac_len, skb->network_header, has_trans ? skb_network_header_len(skb) : -1, has_trans ? skb->transport_header : -1, sh->tx_flags, sh->nr_frags, sh->gso_size, sh->gso_type, sh->gso_segs, - skb->csum, skb->ip_summed, skb->csum_complete_sw, - skb->csum_valid, skb->csum_level, + skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level, skb->hash, skb->sw_hash, skb->l4_hash, - ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); + ntohs(skb->protocol), skb->pkt_type, skb->skb_iif, + skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all, + skb->encapsulation, skb->inner_protocol, skb->inner_mac_header, + skb->inner_network_header, skb->inner_transport_header); if (dev) printk("%sdev name=%s feat=%pNF\n", @@ -1444,7 +1421,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1471,7 +1448,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, reason, true); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1509,7 +1486,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED, !!budget); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1640,7 +1617,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED, false); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -2290,9 +2267,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { - skb_free_head(skb, false); + skb_free_head(skb); } off = (data + nhead) - skb->head; @@ -6598,12 +6575,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb, false); + skb_free_head(skb); } skb->head = data; @@ -6738,7 +6715,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; @@ -7018,6 +6995,19 @@ free_now: EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ +static void kfree_skb_napi_cache(struct sk_buff *skb) +{ + /* if SKB is a clone, don't handle this case */ + if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { + __kfree_skb(skb); + return; + } + + local_bh_disable(); + __napi_kfree_skb(skb, SKB_CONSUMED); + local_bh_enable(); +} + /** * skb_attempt_defer_free - queue skb for remote freeing * @skb: buffer @@ -7033,10 +7023,10 @@ void skb_attempt_defer_free(struct sk_buff *skb) unsigned int defer_max; bool kick; - if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || - !cpu_online(cpu) || - cpu == raw_smp_processor_id()) { -nodefer: __kfree_skb(skb); + if (cpu == raw_smp_processor_id() || + WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu)) { +nodefer: kfree_skb_napi_cache(skb); return; } @@ -7044,7 +7034,7 @@ nodefer: __kfree_skb(skb); DEBUG_NET_WARN_ON_ONCE(skb->destructor); sd = &per_cpu(softnet_data, cpu); - defer_max = READ_ONCE(sysctl_skb_defer_max); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); if (READ_ONCE(sd->defer_count) >= defer_max) goto nodefer; @@ -7062,8 +7052,8 @@ nodefer: __kfree_skb(skb); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) - smp_call_function_single_async(cpu, &sd->defer_csd); + if (unlikely(kick)) + kick_defer_list_purge(sd, cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, @@ -7096,7 +7086,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp) { - size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); + size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; ssize_t spliced = 0, ret = 0; unsigned int i; diff --git a/net/core/sock.c b/net/core/sock.c index 0963689a5950..8d6e638b5426 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -127,6 +127,7 @@ #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <linux/net_tstamp.h> #include <net/xfrm.h> #include <linux/ipsec.h> @@ -283,7 +284,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE; int sysctl_tstamp_allow_data __read_mostly = 1; @@ -2526,13 +2526,12 @@ EXPORT_SYMBOL(skb_set_owner_w); static bool can_skb_orphan_partial(const struct sk_buff *skb) { -#ifdef CONFIG_TLS_DEVICE /* Drivers depend on in-order delivery for crypto offload, * partial orphan breaks out-of-order-OK logic. */ - if (skb->decrypted) + if (skb_is_decrypted(skb)) return false; -#endif + return (skb->destructor == sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); } @@ -3338,7 +3337,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3353,7 +3352,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3373,7 +3372,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3398,7 +3397,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8598466a3805..9402889840bf 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,8 +24,16 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +/* This mutex is used to + * - protect race between prog/link attach/detach and link prog update, and + * - protect race between releasing and accessing map in bpf_link. + * A single global mutex lock is used since it is expected contention is low. + */ +static DEFINE_MUTEX(sockmap_mutex); + static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); + struct bpf_prog *old, struct bpf_link *link, + u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) @@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); fdput(f); return ret; } @@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) goto put_prog; } - ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type); + mutex_unlock(&sockmap_mutex); put_prog: bpf_prog_put(prog); put_map: @@ -1460,55 +1472,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, - u32 which) +static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + struct bpf_link ***plink, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **cur_pprog; + struct bpf_link **cur_plink; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - *pprog = &progs->msg_parser; + cur_pprog = &progs->msg_parser; + cur_plink = &progs->msg_parser_link; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - *pprog = &progs->stream_parser; + cur_pprog = &progs->stream_parser; + cur_plink = &progs->stream_parser_link; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - *pprog = &progs->stream_verdict; + cur_pprog = &progs->stream_verdict; + cur_plink = &progs->stream_verdict_link; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - *pprog = &progs->skb_verdict; + cur_pprog = &progs->skb_verdict; + cur_plink = &progs->skb_verdict_link; break; default: return -EOPNOTSUPP; } + *pprog = cur_pprog; + if (plink) + *plink = cur_plink; return 0; } +/* Handle the following four cases: + * prog_attach: prog != NULL, old == NULL, link == NULL + * prog_detach: prog == NULL, old != NULL, link == NULL + * link_attach: prog != NULL, old == NULL, link != NULL + * link_detach: prog == NULL, old != NULL, link != NULL + */ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) + struct bpf_prog *old, struct bpf_link *link, + u32 which) { struct bpf_prog **pprog; + struct bpf_link **plink; int ret; - ret = sock_map_prog_lookup(map, &pprog, which); + ret = sock_map_prog_link_lookup(map, &pprog, &plink, which); if (ret) return ret; - if (old) - return psock_replace_prog(pprog, prog, old); + /* for prog_attach/prog_detach/link_attach, return error if a bpf_link + * exists for that prog. + */ + if ((!link || prog) && *plink) + return -EBUSY; - psock_set_prog(pprog, prog); - return 0; + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (!ret) + *plink = NULL; + } else { + psock_set_prog(pprog, prog); + if (link) + *plink = link; + } + + return ret; } int sock_map_bpf_prog_query(const union bpf_attr *attr, @@ -1533,7 +1574,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, rcu_read_lock(); - ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type); if (ret) goto end; @@ -1663,6 +1704,196 @@ void sock_map_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(sock_map_close); +struct sockmap_link { + struct bpf_link link; + struct bpf_map *map; + enum bpf_attach_type attach_type; +}; + +static void sock_map_link_release(struct bpf_link *link) +{ + struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + + mutex_lock(&sockmap_mutex); + if (!sockmap_link->map) + goto out; + + WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, + sockmap_link->attach_type)); + + bpf_map_put_with_uref(sockmap_link->map); + sockmap_link->map = NULL; +out: + mutex_unlock(&sockmap_mutex); +} + +static int sock_map_link_detach(struct bpf_link *link) +{ + sock_map_link_release(link); + return 0; +} + +static void sock_map_link_dealloc(struct bpf_link *link) +{ + kfree(link); +} + +/* Handle the following two cases: + * case 1: link != NULL, prog != NULL, old != NULL + * case 2: link != NULL, prog != NULL, old == NULL + */ +static int sock_map_link_update_prog(struct bpf_link *link, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + struct bpf_prog **pprog, *old_link_prog; + struct bpf_link **plink; + int ret = 0; + + mutex_lock(&sockmap_mutex); + + /* If old prog is not NULL, ensure old prog is the same as link->prog. */ + if (old && link->prog != old) { + ret = -EPERM; + goto out; + } + /* Ensure link->prog has the same type/attach_type as the new prog. */ + if (link->prog->type != prog->type || + link->prog->expected_attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + + ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, + sockmap_link->attach_type); + if (ret) + goto out; + + /* return error if the stored bpf_link does not match the incoming bpf_link. */ + if (link != *plink) { + ret = -EBUSY; + goto out; + } + + if (old) { + ret = psock_replace_prog(pprog, prog, old); + if (ret) + goto out; + } else { + psock_set_prog(pprog, prog); + } + + bpf_prog_inc(prog); + old_link_prog = xchg(&link->prog, prog); + bpf_prog_put(old_link_prog); + +out: + mutex_unlock(&sockmap_mutex); + return ret; +} + +static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link) +{ + u32 map_id = 0; + + mutex_lock(&sockmap_mutex); + if (sockmap_link->map) + map_id = sockmap_link->map->id; + mutex_unlock(&sockmap_mutex); + return map_id; +} + +static int sock_map_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + info->sockmap.map_id = map_id; + info->sockmap.attach_type = sockmap_link->attach_type; + return 0; +} + +static void sock_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link); + u32 map_id = sock_map_link_get_map_id(sockmap_link); + + seq_printf(seq, "map_id:\t%u\n", map_id); + seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); +} + +static const struct bpf_link_ops sock_map_link_ops = { + .release = sock_map_link_release, + .dealloc = sock_map_link_dealloc, + .detach = sock_map_link_detach, + .update_prog = sock_map_link_update_prog, + .fill_link_info = sock_map_link_fill_info, + .show_fdinfo = sock_map_link_show_fdinfo, +}; + +int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct sockmap_link *sockmap_link; + enum bpf_attach_type attach_type; + struct bpf_map *map; + int ret; + + if (attr->link_create.flags) + return -EINVAL; + + map = bpf_map_get_with_uref(attr->link_create.target_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) { + ret = -EINVAL; + goto out; + } + + sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER); + if (!sockmap_link) { + ret = -ENOMEM; + goto out; + } + + attach_type = attr->link_create.attach_type; + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + sockmap_link->map = map; + sockmap_link->attach_type = attach_type; + + ret = bpf_link_prime(&sockmap_link->link, &link_primer); + if (ret) { + kfree(sockmap_link); + goto out; + } + + mutex_lock(&sockmap_mutex); + ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type); + mutex_unlock(&sockmap_mutex); + if (ret) { + bpf_link_cleanup(&link_primer); + goto out; + } + + /* Increase refcnt for the prog since when old prog is replaced with + * psock_replace_prog() and psock_set_prog() its refcnt will be decreased. + * + * Actually, we do not need to increase refcnt for the prog since bpf_link + * will hold a reference. But in order to have less complexity w.r.t. + * replacing/setting prog, let us increase the refcnt to make things simpler. + */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out: + bpf_map_put_with_uref(map); + return ret; +} + static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6973dda3abda..c9fb9ad87485 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ #include <net/busy_poll.h> #include <net/pkt_sched.h> #include <net/hotdata.h> +#include <net/proto_memory.h> #include <net/rps.h> #include "dev.h" @@ -415,7 +416,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "mem_pcpu_rsv", - .data = &sysctl_mem_pcpu_rsv, + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -595,7 +596,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -654,13 +655,12 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { @@ -697,7 +697,6 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -715,20 +714,21 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) + tbl[i].data += (char *)net - (char *)&init_net; } - net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, - ARRAY_SIZE(netns_core_table)); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -743,7 +743,7 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 4d9823d6dced..d6b30700af67 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -353,6 +353,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * @sk: socket to perform estimator on + * @mrtt: measured RTT * * This code is almost identical with TCP's tcp_rtt_estimator(), since * - it has a higher sampling frequency (recommended by RFC 1323), diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 44b033fe1ef6..ff41bd6f99c3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -24,6 +24,7 @@ #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/netns/generic.h> +#include <net/rstreason.h> #include "ackvec.h" #include "ccid.h" @@ -521,7 +522,8 @@ out: return err; } -static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) +static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, + enum sk_rst_reason reason) { int err; const struct iphdr *rxiph; @@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - dccp_v4_ctl_send_reset(sk, skb); + dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); kfree_skb(skb); return 0; } @@ -869,7 +871,7 @@ lookup: if (nsk == sk) { reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v4_ctl_send_reset(sk, skb); + dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); goto discard_and_relse; } else { sock_put(sk); @@ -909,7 +911,7 @@ no_dccp_socket: if (dh->dccph_type != DCCP_PKT_RESET) { DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - dccp_v4_ctl_send_reset(sk, skb); + dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); } discard_it: @@ -1039,7 +1041,7 @@ static void __net_exit dccp_v4_exit_net(struct net *net) static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&dccp_hashinfo, AF_INET); + inet_twsk_purge(&dccp_hashinfo); } static struct pernet_operations dccp_v4_ops = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ded07e09f813..85f4b8fdbe5e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -29,6 +29,7 @@ #include <net/secure_seq.h> #include <net/netns/generic.h> #include <net/sock.h> +#include <net/rstreason.h> #include "dccp.h" #include "ipv6.h" @@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req) kfree_skb(inet_rsk(req)->pktopts); } -static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) +static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, + enum sk_rst_reason reason) { const struct ipv6hdr *rxip6h; struct sk_buff *skb; @@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - dccp_v6_ctl_send_reset(sk, skb); + dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); discard: if (opt_skb != NULL) __kfree_skb(opt_skb); @@ -762,7 +764,7 @@ lookup: if (nsk == sk) { reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v6_ctl_send_reset(sk, skb); + dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); goto discard_and_relse; } else { sock_put(sk); @@ -801,7 +803,7 @@ no_dccp_socket: if (dh->dccph_type != DCCP_PKT_RESET) { DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - dccp_v6_ctl_send_reset(sk, skb); + dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); } discard_it: @@ -1119,15 +1121,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } -static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo, AF_INET6); -} - static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, - .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 64d805b27add..251a57cf5822 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -15,6 +15,7 @@ #include <net/sock.h> #include <net/xfrm.h> #include <net/inet_timewait_sock.h> +#include <net/rstreason.h> #include "ackvec.h" #include "ccid.h" @@ -202,7 +203,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; drop: if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) - req->rsk_ops->send_reset(sk, skb); + req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); inet_csk_reqsk_queue_drop(sk, req); out: diff --git a/net/dccp/output.c b/net/dccp/output.c index fd2eb148d24d..5c2e24f3c39b 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index ee8d4f5afa72..3fc474d6e57d 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -90,8 +90,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - - { } }; static struct ctl_table_header *dccp_table_header; diff --git a/net/devlink/core.c b/net/devlink/core.c index 7f0b093208d7..f49cd83f1955 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -314,7 +314,7 @@ static void devlink_release(struct work_struct *work) mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); - kfree(devlink); + kvfree(devlink); } void devlink_put(struct devlink *devlink) @@ -420,7 +420,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, if (!devlink_reload_actions_valid(ops)) return NULL; - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); if (!devlink) return NULL; @@ -455,7 +455,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, return devlink; err_xa_alloc: - kfree(devlink); + kvfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 19dbf540748a..13c73f50da3d 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -1202,23 +1202,19 @@ static void __devlink_compat_running_version(struct devlink *devlink, if (err) goto free_msg; - nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { + nla_for_each_attr_type(nlattr, DEVLINK_ATTR_INFO_VERSION_RUNNING, + (void *)msg->data, msg->len, rem) { const struct nlattr *kv; int rem_kv; - if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) - continue; - - nla_for_each_nested(kv, nlattr, rem_kv) { - if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) - continue; - + nla_for_each_nested_type(kv, DEVLINK_ATTR_INFO_VERSION_VALUE, + nlattr, rem_kv) { strlcat(buf, nla_data(kv), len); strlcat(buf, " ", len); } } free_msg: - nlmsg_free(msg); + nlmsg_consume(msg); } void devlink_compat_running_version(struct devlink *devlink, diff --git a/net/devlink/param.c b/net/devlink/param.c index 22bc3b500518..dcf0d1ccebba 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -158,11 +158,12 @@ static int devlink_param_get(struct devlink *devlink, static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { if (!param->set) return -EOPNOTSUPP; - return param->set(devlink, param->id, ctx); + return param->set(devlink, param->id, ctx, extack); } static int @@ -571,7 +572,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; - err = devlink_param_set(devlink, param, &ctx); + err = devlink_param_set(devlink, param, &ctx, info->extack); if (err) return err; } diff --git a/net/devlink/port.c b/net/devlink/port.c index 118d130d2afd..be9158b4453c 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -16,6 +16,7 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), + [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 }, }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ @@ -182,6 +183,30 @@ static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, return 0; } +static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + u32 max_io_eqs; + int err; + + if (!port->ops->port_fn_max_io_eqs_get) + return 0; + + err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs); + if (err) + return err; + *msg_updated = true; + return 0; +} + int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) @@ -410,6 +435,18 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, } static int +devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + u32 max_io_eqs; + + max_io_eqs = nla_get_u32(attr); + return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port, + max_io_eqs, extack); +} + +static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { @@ -430,6 +467,9 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; + err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated); + if (err) + goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, @@ -726,6 +766,12 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, } } } + if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] && + !ops->port_fn_max_io_eqs_set) { + NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS], + "Function does not support max_io_eqs setting"); + return -EOPNOTSUPP; + } return 0; } @@ -761,6 +807,13 @@ static int devlink_port_function_set(struct devlink_port *port, return err; } + attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS]; + if (attr) { + err = devlink_port_fn_max_io_eqs_set(port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c index 431bf52290a1..0aac887d0098 100644 --- a/net/dsa/devlink.c +++ b/net/dsa/devlink.c @@ -194,7 +194,8 @@ int dsa_devlink_param_get(struct devlink *dl, u32 id, EXPORT_SYMBOL_GPL(dsa_devlink_param_get); int dsa_devlink_param_set(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dsa_devlink_to_ds(dl); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 09d2f5d4b3dd..12521a7d4048 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1505,6 +1505,16 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (!ds->num_ports) return -EINVAL; + if (ds->phylink_mac_ops) { + if (ds->ops->phylink_mac_select_pcs || + ds->ops->phylink_mac_prepare || + ds->ops->phylink_mac_config || + ds->ops->phylink_mac_finish || + ds->ops->phylink_mac_link_down || + ds->ops->phylink_mac_link_up) + return -EINVAL; + } + if (np) { err = dsa_switch_parse_of(ds, np); if (err) diff --git a/net/dsa/port.c b/net/dsa/port.c index c42dac87671b..9a249d4ac3a5 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1535,30 +1535,11 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } -static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) -{ - struct device_node *phy_dn; - struct phy_device *phydev; - - phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0); - if (!phy_dn) - return NULL; - - phydev = of_phy_find_device(phy_dn); - if (!phydev) { - of_node_put(phy_dn); - return ERR_PTR(-EPROBE_DEFER); - } - - of_node_put(phy_dn); - return phydev; -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP); struct dsa_switch *ds = dp->ds; @@ -1572,7 +1553,7 @@ static int dsa_port_phylink_mac_prepare(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; int err = 0; @@ -1587,7 +1568,7 @@ static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; if (!ds->ops->phylink_mac_config) @@ -1600,7 +1581,7 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; int err = 0; @@ -1615,18 +1596,11 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct phy_device *phydev = NULL; + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; - if (dsa_port_is_user(dp)) - phydev = dp->user->phydev; - - if (!ds->ops->phylink_mac_link_down) { - if (ds->ops->adjust_link && phydev) - ds->ops->adjust_link(ds, dp->index, phydev); + if (!ds->ops->phylink_mac_link_down) return; - } ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } @@ -1638,14 +1612,11 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, int speed, int duplex, bool tx_pause, bool rx_pause) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; - if (!ds->ops->phylink_mac_link_up) { - if (ds->ops->adjust_link && phydev) - ds->ops->adjust_link(ds, dp->index, phydev); + if (!ds->ops->phylink_mac_link_up) return; - } ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev, speed, duplex, tx_pause, rx_pause); @@ -1662,6 +1633,7 @@ static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { int dsa_port_phylink_create(struct dsa_port *dp) { + const struct phylink_mac_ops *mac_ops; struct dsa_switch *ds = dp->ds; phy_interface_t mode; struct phylink *pl; @@ -1685,8 +1657,12 @@ int dsa_port_phylink_create(struct dsa_port *dp) } } - pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), - mode, &dsa_port_phylink_mac_ops); + mac_ops = &dsa_port_phylink_mac_ops; + if (ds->phylink_mac_ops) + mac_ops = ds->phylink_mac_ops; + + pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode, + mac_ops); if (IS_ERR(pl)) { pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl)); return PTR_ERR(pl); @@ -1703,78 +1679,6 @@ void dsa_port_phylink_destroy(struct dsa_port *dp) dp->pl = NULL; } -static int dsa_shared_port_setup_phy_of(struct dsa_port *dp, bool enable) -{ - struct dsa_switch *ds = dp->ds; - struct phy_device *phydev; - int port = dp->index; - int err = 0; - - phydev = dsa_port_get_phy_device(dp); - if (!phydev) - return 0; - - if (IS_ERR(phydev)) - return PTR_ERR(phydev); - - if (enable) { - err = genphy_resume(phydev); - if (err < 0) - goto err_put_dev; - - err = genphy_read_status(phydev); - if (err < 0) - goto err_put_dev; - } else { - err = genphy_suspend(phydev); - if (err < 0) - goto err_put_dev; - } - - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); - - dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev)); - -err_put_dev: - put_device(&phydev->mdio.dev); - return err; -} - -static int dsa_shared_port_fixed_link_register_of(struct dsa_port *dp) -{ - struct device_node *dn = dp->dn; - struct dsa_switch *ds = dp->ds; - struct phy_device *phydev; - int port = dp->index; - phy_interface_t mode; - int err; - - err = of_phy_register_fixed_link(dn); - if (err) { - dev_err(ds->dev, - "failed to register the fixed PHY of port %d\n", - port); - return err; - } - - phydev = of_phy_find_device(dn); - - err = of_get_phy_mode(dn, &mode); - if (err) - mode = PHY_INTERFACE_MODE_NA; - phydev->interface = mode; - - genphy_read_status(phydev); - - if (ds->ops->adjust_link) - ds->ops->adjust_link(ds, port, phydev); - - put_device(&phydev->mdio.dev); - - return 0; -} - static int dsa_shared_port_phylink_register(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; @@ -1952,12 +1856,23 @@ static void dsa_shared_port_validate_of(struct dsa_port *dp, dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); } +static void dsa_shared_port_link_down(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down) + ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED, + PHY_INTERFACE_MODE_NA); + else if (ds->ops->phylink_mac_link_down) + ds->ops->phylink_mac_link_down(ds, dp->index, MLO_AN_FIXED, + PHY_INTERFACE_MODE_NA); +} + int dsa_shared_port_link_register_of(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; bool missing_link_description; bool missing_phy_mode; - int port = dp->index; dsa_shared_port_validate_of(dp, &missing_phy_mode, &missing_link_description); @@ -1967,46 +1882,28 @@ int dsa_shared_port_link_register_of(struct dsa_port *dp) dsa_switches_apply_workarounds)) return -EINVAL; - if (!ds->ops->adjust_link) { - if (missing_link_description) { - dev_warn(ds->dev, - "Skipping phylink registration for %s port %d\n", - dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); - } else { - if (ds->ops->phylink_mac_link_down) - ds->ops->phylink_mac_link_down(ds, port, - MLO_AN_FIXED, PHY_INTERFACE_MODE_NA); + if (missing_link_description) { + dev_warn(ds->dev, + "Skipping phylink registration for %s port %d\n", + dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index); + } else { + dsa_shared_port_link_down(dp); - return dsa_shared_port_phylink_register(dp); - } - return 0; + return dsa_shared_port_phylink_register(dp); } - dev_warn(ds->dev, - "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n"); - - if (of_phy_is_fixed_link(dp->dn)) - return dsa_shared_port_fixed_link_register_of(dp); - else - return dsa_shared_port_setup_phy_of(dp, true); + return 0; } void dsa_shared_port_link_unregister_of(struct dsa_port *dp) { - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->adjust_link && dp->pl) { + if (dp->pl) { rtnl_lock(); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); return; } - - if (of_phy_is_fixed_link(dp->dn)) - of_phy_deregister_fixed_link(dp->dn); - else - dsa_shared_port_setup_phy_of(dp, false); } int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr, diff --git a/net/dsa/user.c b/net/dsa/user.c index 16d395bb1a1f..867c5fe9a4da 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -2120,7 +2120,7 @@ int dsa_user_change_mtu(struct net_device *dev, int new_mtu) if (err) goto out_port_failed; - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); @@ -2137,6 +2137,32 @@ out_conduit_failed: } static int __maybe_unused +dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_set_apptrust) + return -EOPNOTSUPP; + + return ds->ops->port_set_apptrust(ds, port, sel, nsel); +} + +static int __maybe_unused +dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_get_apptrust) + return -EOPNOTSUPP; + + return ds->ops->port_get_apptrust(ds, port, sel, nsel); +} + +static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); @@ -2163,6 +2189,58 @@ dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) return 0; } +/* Update the DSCP prio entries on all user ports of the switch in case + * the switch supports global DSCP prio instead of per port DSCP prios. + */ +static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, + struct dcb_app *app, bool del) +{ + int (*setdel)(struct net_device *dev, struct dcb_app *app); + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct dsa_port *other_dp; + int err, restore_err; + + if (del) + setdel = dcb_ieee_delapp; + else + setdel = dcb_ieee_setapp; + + dsa_switch_for_each_user_port(other_dp, ds) { + struct net_device *user = other_dp->user; + + if (!user || user == dev) + continue; + + err = setdel(user, app); + if (err) + goto err_try_to_restore; + } + + return 0; + +err_try_to_restore: + + /* Revert logic to restore previous state of app entries */ + if (!del) + setdel = dcb_ieee_delapp; + else + setdel = dcb_ieee_setapp; + + dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { + struct net_device *user = other_dp->user; + + if (!user || user == dev) + continue; + + restore_err = setdel(user, app); + if (restore_err) + netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); + } + + return err; +} + static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { @@ -2194,6 +2272,17 @@ dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) return err; } + if (!ds->dscp_prio_mapping_is_global) + return 0; + + err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); + if (err) { + if (ds->ops->port_del_dscp_prio) + ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); + dcb_ieee_delapp(dev, app); + return err; + } + return 0; } @@ -2264,6 +2353,18 @@ dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) return err; } + if (!ds->dscp_prio_mapping_is_global) + return 0; + + err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); + if (err) { + if (ds->ops->port_add_dscp_prio) + ds->ops->port_add_dscp_prio(ds, port, dscp, + app->priority); + dcb_ieee_setapp(dev, app); + return err; + } + return 0; } @@ -2376,6 +2477,8 @@ static const struct ethtool_ops dsa_user_ethtool_ops = { static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, + .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, + .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, @@ -2445,7 +2548,7 @@ EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); + struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index bd04f28d5cf4..563e94e0cbd8 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -4,6 +4,7 @@ #include <linux/ethtool_netlink.h> #include <linux/pm_runtime.h> #include "netlink.h" +#include <linux/phy_link_topology.h> static struct genl_family ethtool_genl_family; @@ -30,6 +31,24 @@ const struct nla_policy ethnl_header_policy_stats[] = { ETHTOOL_FLAGS_STATS), }; +const struct nla_policy ethnl_header_policy_phy[] = { + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, + ETHTOOL_FLAGS_BASIC), + [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +const struct nla_policy ethnl_header_policy_phy_stats[] = { + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, + ETHTOOL_FLAGS_STATS), + [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + int ethnl_ops_begin(struct net_device *dev) { int ret; @@ -89,8 +108,9 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { - struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)]; + struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; const struct nlattr *devname_attr; + struct phy_device *phydev = NULL; struct net_device *dev = NULL; u32 flags = 0; int ret; @@ -104,7 +124,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. */ - ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy) - 1, header, + ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, NULL, extack); if (ret < 0) return ret; @@ -145,6 +165,30 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, return -EINVAL; } + if (dev) { + if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { + struct nlattr *phy_id; + + phy_id = tb[ETHTOOL_A_HEADER_PHY_INDEX]; + phydev = phy_link_topo_get_phy(dev->link_topo, + nla_get_u32(phy_id)); + if (!phydev) { + NL_SET_BAD_ATTR(extack, phy_id); + return -ENODEV; + } + } else { + /* If we need a PHY but no phy index is specified, fallback + * to dev->phydev + */ + phydev = dev->phydev; + } + } else if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { + NL_SET_ERR_MSG_ATTR(extack, header, + "can't target a PHY without a netdev"); + return -EINVAL; + } + + req_info->phydev = phydev; req_info->dev = dev; req_info->flags = flags; return 0; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9a333a8d04c1..d57a890b5d9e 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -250,6 +250,7 @@ static inline unsigned int ethnl_reply_header_size(void) * @dev: network device the request is for (may be null) * @dev_tracker: refcount tracker for @dev reference * @flags: request flags common for all request types + * @phydev: phy_device connected to @dev this request is for (may be null) * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at @@ -259,6 +260,7 @@ struct ethnl_req_info { struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; + struct phy_device *phydev; }; static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) @@ -395,9 +397,12 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; +extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; +extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1]; +extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index cc478af77111..2c981d443f27 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -82,6 +82,10 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ if (st->podl_pw_status > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */ + if (st->c33_admin_state > 0) + len += nla_total_size(sizeof(u32)); /* _C33_PSE_ADMIN_STATE */ + if (st->c33_pw_status > 0) + len += nla_total_size(sizeof(u32)); /* _C33_PSE_PW_D_STATUS */ return len; } @@ -103,6 +107,16 @@ static int pse_fill_reply(struct sk_buff *skb, st->podl_pw_status)) return -EMSGSIZE; + if (st->c33_admin_state > 0 && + nla_put_u32(skb, ETHTOOL_A_C33_PSE_ADMIN_STATE, + st->c33_admin_state)) + return -EMSGSIZE; + + if (st->c33_pw_status > 0 && + nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_D_STATUS, + st->c33_pw_status)) + return -EMSGSIZE; + return 0; } @@ -113,25 +127,18 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), + [ETHTOOL_A_C33_PSE_ADMIN_CONTROL] = + NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED), }; static int ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) { - return !!info->attrs[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]; -} - -static int -ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) -{ struct net_device *dev = req_info->dev; - struct pse_control_config config = {}; struct nlattr **tb = info->attrs; struct phy_device *phydev; - /* this values are already validated by the ethnl_pse_set_policy */ - config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); - phydev = dev->phydev; if (!phydev) { NL_SET_ERR_MSG(info->extack, "No PHY is attached"); @@ -143,6 +150,39 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) return -EOPNOTSUPP; } + if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] && + !pse_has_podl(phydev->psec)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL], + "setting PoDL PSE admin control not supported"); + return -EOPNOTSUPP; + } + if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL] && + !pse_has_c33(phydev->psec)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL], + "setting C33 PSE admin control not supported"); + return -EOPNOTSUPP; + } + + return 1; +} + +static int +ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct net_device *dev = req_info->dev; + struct pse_control_config config = {}; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; + + phydev = dev->phydev; + /* These values are already validated by the ethnl_pse_set_policy */ + if (pse_has_podl(phydev->psec)) + config.podl_admin_control = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); + if (pse_has_c33(phydev->psec)) + config.c33_admin_control = nla_get_u32(tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]); + /* Return errno directly - PSE has no notification */ return pse_ethtool_set_config(phydev->psec, info->extack, &config); } diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 9daed0aab162..be2755c8d8fd 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -13,14 +13,18 @@ struct tsinfo_req_info { struct tsinfo_reply_data { struct ethnl_reply_data base; struct ethtool_ts_info ts_info; + struct ethtool_ts_stats stats; }; #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) +#define ETHTOOL_TS_STAT_CNT \ + (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) + const struct nla_policy ethnl_tsinfo_get_policy[] = { [ETHTOOL_A_TSINFO_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, @@ -34,6 +38,12 @@ static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + if (req_base->flags & ETHTOOL_FLAG_STATS && + dev->ethtool_ops->get_ts_stats) { + ethtool_stats_init((u64 *)&data->stats, + sizeof(data->stats) / sizeof(u64)); + dev->ethtool_ops->get_ts_stats(dev, &data->stats); + } ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); @@ -79,10 +89,47 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, } if (ts_info->phc_index >= 0) len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += nla_total_size(0) + /* _TSINFO_STATS */ + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; return len; } +static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) +{ + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + if (nla_put_uint(skb, attrtype, val)) + return -EMSGSIZE; + return 0; +} + +static int tsinfo_put_stats(struct sk_buff *skb, + const struct ethtool_ts_stats *stats) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS); + if (!nest) + return -EMSGSIZE; + + if (tsinfo_put_stat(skb, stats->tx_stats.pkts, + ETHTOOL_A_TS_STAT_TX_PKTS) || + tsinfo_put_stat(skb, stats->tx_stats.lost, + ETHTOOL_A_TS_STAT_TX_LOST) || + tsinfo_put_stat(skb, stats->tx_stats.err, + ETHTOOL_A_TS_STAT_TX_ERR)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int tsinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) @@ -119,6 +166,9 @@ static int tsinfo_fill_reply(struct sk_buff *skb, if (ts_info->phc_index >= 0 && nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) return -EMSGSIZE; + if (req_base->flags & ETHTOOL_FLAG_STATS && + tsinfo_put_stats(skb, &data->stats)) + return -EMSGSIZE; return 0; } diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index d697f68c598c..d6f52839827e 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -213,7 +213,6 @@ static int tls_handshake_accept(struct handshake_req *req, if (!hdr) goto out_cancel; - ret = -EMSGSIZE; ret = nla_put_s32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd); if (ret < 0) goto out_cancel; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 5afc450d0855..e6904288d40d 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -120,7 +120,7 @@ static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } @@ -143,6 +143,9 @@ static int hsr_dev_open(struct net_device *dev) case HSR_PT_SLAVE_B: designation = "Slave B"; break; + case HSR_PT_INTERLINK: + designation = "Interlink"; + break; default: designation = "Unknown"; } @@ -282,6 +285,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, struct hsr_priv *hsr = master->hsr; __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tlv *hsr_stlv; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; @@ -321,6 +325,16 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); + if (hsr->redbox) { + hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv)); + hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC; + hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload); + + /* Payload: MacAddressRedBox */ + hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); + ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox); + } + if (skb_put_padto(skb, ETH_ZLEN)) { spin_unlock_bh(&hsr->seqnr_lock); return; @@ -402,6 +416,10 @@ void hsr_del_ports(struct hsr_priv *hsr) if (port) hsr_del_port(port); + port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + if (port) + hsr_del_port(port); + port = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port) hsr_del_port(port); @@ -531,8 +549,8 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { }; int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], - unsigned char multicast_spec, u8 protocol_version, - struct netlink_ext_ack *extack) + struct net_device *interlink, unsigned char multicast_spec, + u8 protocol_version, struct netlink_ext_ack *extack) { bool unregister = false; struct hsr_priv *hsr; @@ -541,6 +559,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); + INIT_LIST_HEAD(&hsr->proxy_node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); @@ -566,9 +585,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; + hsr->interlink_sequence_nr = HSR_SEQNR_START; timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); + timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; @@ -601,6 +622,17 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + if (interlink) { + res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); + if (res) + goto err_unregister; + + hsr->redbox = true; + ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr); + mod_timer(&hsr->prune_proxy_timer, + jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); + } + hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 9060c92168f9..655284095b78 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -16,8 +16,8 @@ void hsr_del_ports(struct hsr_priv *hsr); void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], - unsigned char multicast_spec, u8 protocol_version, - struct netlink_ext_ack *extack); + struct net_device *interlink, unsigned char multicast_spec, + u8 protocol_version, struct netlink_ext_ack *extack); void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); int hsr_get_max_mtu(struct hsr_priv *hsr); #endif /* __HSR_DEVICE_H */ diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 5d68cb181695..05a61b8286ec 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -377,6 +377,15 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, */ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); } + + /* When HSR node is used as RedBox - the frame received from HSR ring + * requires source MAC address (SA) replacement to one which can be + * recognized by SAN devices (otherwise, frames are dropped by switch) + */ + if (port->type == HSR_PT_INTERLINK) + ether_addr_copy(eth_hdr(skb)->h_source, + port->hsr->macaddress_redbox); + return dev_queue_xmit(skb); } @@ -390,9 +399,57 @@ bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { + struct sk_buff *skb; + if (port->dev->features & NETIF_F_HW_HSR_FWD) return prp_drop_frame(frame, port); + /* RedBox specific frames dropping policies + * + * Do not send HSR supervisory frames to SAN devices + */ + if (frame->is_supervision && port->type == HSR_PT_INTERLINK) + return true; + + /* Do not forward to other HSR port (A or B) unicast frames which + * are addressed to interlink port (and are in the ProxyNodeTable). + */ + skb = frame->skb_hsr; + if (skb && prp_drop_frame(frame, port) && + is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + hsr_is_node_in_db(&port->hsr->proxy_node_db, + eth_hdr(skb)->h_dest)) { + return true; + } + + /* Do not forward to port C (Interlink) frames from nodes A and B + * if DA is in NodeTable. + */ + if ((frame->port_rcv->type == HSR_PT_SLAVE_A || + frame->port_rcv->type == HSR_PT_SLAVE_B) && + port->type == HSR_PT_INTERLINK) { + skb = frame->skb_hsr; + if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + hsr_is_node_in_db(&port->hsr->node_db, + eth_hdr(skb)->h_dest)) { + return true; + } + } + + /* Do not forward to port A and B unicast frames received on the + * interlink port if it is addressed to one of nodes registered in + * the ProxyNodeTable. + */ + if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) && + frame->port_rcv->type == HSR_PT_INTERLINK) { + skb = frame->skb_std; + if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + hsr_is_node_in_db(&port->hsr->proxy_node_db, + eth_hdr(skb)->h_dest)) { + return true; + } + } + return false; } @@ -448,13 +505,14 @@ static void hsr_forward_do(struct hsr_frame_info *frame) } /* Check if frame is to be dropped. Eg. for PRP no forward - * between ports. + * between ports, or sending HSR supervision to RedBox. */ if (hsr->proto_ops->drop_frame && hsr->proto_ops->drop_frame(frame, port)) continue; - if (port->type != HSR_PT_MASTER) + if (port->type == HSR_PT_SLAVE_A || + port->type == HSR_PT_SLAVE_B) skb = hsr->proto_ops->create_tagged_frame(frame, port); else skb = hsr->proto_ops->get_untagged_frame(frame, port); @@ -469,7 +527,9 @@ static void hsr_forward_do(struct hsr_frame_info *frame) hsr_deliver_master(skb, port->dev, frame->node_src); } else { if (!hsr_xmit(skb, port, frame)) - sent = true; + if (port->type == HSR_PT_SLAVE_A || + port->type == HSR_PT_SLAVE_B) + sent = true; } } } @@ -503,10 +563,12 @@ static void handle_std_frame(struct sk_buff *skb, frame->skb_prp = NULL; frame->skb_std = skb; - if (port->type != HSR_PT_MASTER) { + if (port->type != HSR_PT_MASTER) frame->is_from_san = true; - } else { - /* Sequence nr for the master node */ + + if (port->type == HSR_PT_MASTER || + port->type == HSR_PT_INTERLINK) { + /* Sequence nr for the master/interlink node */ lockdep_assert_held(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; @@ -564,6 +626,7 @@ static int fill_frame_info(struct hsr_frame_info *frame, { struct hsr_priv *hsr = port->hsr; struct hsr_vlan_ethhdr *vlan_hdr; + struct list_head *n_db; struct ethhdr *ethhdr; __be16 proto; int ret; @@ -574,9 +637,13 @@ static int fill_frame_info(struct hsr_frame_info *frame, memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); - frame->node_src = hsr_get_node(port, &hsr->node_db, skb, - frame->is_supervision, - port->type); + + n_db = &hsr->node_db; + if (port->type == HSR_PT_INTERLINK) + n_db = &hsr->proxy_node_db; + + frame->node_src = hsr_get_node(port, n_db, skb, + frame->is_supervision, port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 26329db09210..614df9649794 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -71,6 +71,14 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, return NULL; } +/* Check if node for a given MAC address is already present in data base + */ +bool hsr_is_node_in_db(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]) +{ + return !!find_node_by_addr_A(node_db, addr); +} + /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ @@ -223,6 +231,15 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, } } + /* Check if required node is not in proxy nodes table */ + list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { + if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { + if (hsr->proto_ops->update_san_info) + hsr->proto_ops->update_san_info(node, is_sup); + return node; + } + } + /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ @@ -418,6 +435,10 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); + if (!node_dst && port->hsr->redbox) + node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db, + eth_hdr(skb)->h_dest); + if (!node_dst) { if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); @@ -561,6 +582,37 @@ void hsr_prune_nodes(struct timer_list *t) jiffies + msecs_to_jiffies(PRUNE_PERIOD)); } +void hsr_prune_proxy_nodes(struct timer_list *t) +{ + struct hsr_priv *hsr = from_timer(hsr, t, prune_proxy_timer); + unsigned long timestamp; + struct hsr_node *node; + struct hsr_node *tmp; + + spin_lock_bh(&hsr->list_lock); + list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) { + timestamp = node->time_in[HSR_PT_INTERLINK]; + + /* Prune old entries */ + if (time_is_before_jiffies(timestamp + + msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) { + hsr_nl_nodedown(hsr, node->macaddress_A); + if (!node->removed) { + list_del_rcu(&node->mac_list); + node->removed = true; + /* Note that we need to free this entry later: */ + kfree_rcu(node, rcu_head); + } + } + } + + spin_unlock_bh(&hsr->list_lock); + + /* Restart timer */ + mod_timer(&hsr->prune_proxy_timer, + jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD)); +} + void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index b23556251d62..7619e31c1d2d 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -46,6 +46,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr); void hsr_prune_nodes(struct timer_list *t); +void hsr_prune_proxy_nodes(struct timer_list *t); int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], @@ -67,6 +68,9 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node); void prp_update_san_info(struct hsr_node *node, bool is_sup); +bool hsr_is_node_in_db(struct list_head *node_db, + const unsigned char addr[ETH_ALEN]); + struct hsr_node { struct list_head mac_list; /* Protect R/W access to seq_out */ diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 9756e657bab9..d7ae32473c41 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -96,7 +96,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, break; /* Handled in ndo_change_mtu() */ mtu_max = hsr_get_max_mtu(port->hsr); master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); - master->dev->mtu = mtu_max; + WRITE_ONCE(master->dev->mtu, mtu_max); break; case NETDEV_UNREGISTER: if (!is_hsr_master(dev)) { diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 18e01791ad79..23850b16d1ea 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -21,6 +21,7 @@ */ #define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */ #define HSR_NODE_FORGET_TIME 60000 /* ms */ +#define HSR_PROXY_NODE_FORGET_TIME 60000 /* ms */ #define HSR_ANNOUNCE_INTERVAL 100 /* ms */ #define HSR_ENTRY_FORGET_TIME 400 /* ms */ @@ -35,6 +36,7 @@ * HSR_NODE_FORGET_TIME? */ #define PRUNE_PERIOD 3000 /* ms */ +#define PRUNE_PROXY_PERIOD 3000 /* ms */ #define HSR_TLV_EOT 0 /* End of TLVs */ #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23 @@ -192,11 +194,14 @@ struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; struct list_head node_db; /* Known HSR nodes */ + struct list_head proxy_node_db; /* RedBox HSR proxy nodes */ struct hsr_self_node __rcu *self_node; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ struct timer_list prune_timer; + struct timer_list prune_proxy_timer; int announce_count; u16 sequence_nr; + u16 interlink_sequence_nr; /* Interlink port seq_nr */ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ @@ -209,6 +214,8 @@ struct hsr_priv { * of lan_id */ bool fwd_offloaded; /* Forwarding offloaded to HW */ + bool redbox; /* Device supports HSR RedBox */ + unsigned char macaddress_redbox[ETH_ALEN]; unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16)); /* Align to u16 boundary to avoid unaligned access * in ether_addr_equal diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 78fe40eb9f01..898f18c6da53 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -23,6 +23,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 }, + [IFLA_HSR_INTERLINK] = { .type = NLA_U32 }, }; /* Here, it seems a netdevice has already been allocated for us, and the @@ -35,8 +36,8 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; - struct net_device *link[2]; + struct net_device *link[2], *interlink = NULL; if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); return -EINVAL; @@ -67,6 +68,20 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; } + if (data[IFLA_HSR_INTERLINK]) + interlink = __dev_get_by_index(src_net, + nla_get_u32(data[IFLA_HSR_INTERLINK])); + + if (interlink && interlink == link[0]) { + NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave1 are the same"); + return -EINVAL; + } + + if (interlink && interlink == link[1]) { + NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave2 are the same"); + return -EINVAL; + } + if (!data[IFLA_HSR_MULTICAST_SPEC]) multicast_spec = 0; else @@ -96,10 +111,17 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, } } - if (proto == HSR_PROTOCOL_PRP) + if (proto == HSR_PROTOCOL_PRP) { proto_version = PRP_V1; + if (interlink) { + NL_SET_ERR_MSG_MOD(extack, + "Interlink only works with HSR"); + return -EINVAL; + } + } - return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack); + return hsr_dev_finalize(dev, link, interlink, multicast_spec, + proto_version, extack); } static void hsr_dellink(struct net_device *dev, struct list_head *head) @@ -107,6 +129,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) struct hsr_priv *hsr = netdev_priv(dev); del_timer_sync(&hsr->prune_timer); + del_timer_sync(&hsr->prune_proxy_timer); del_timer_sync(&hsr->announce_timer); hsr_debugfs_term(hsr); @@ -114,6 +137,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) hsr_del_self_node(hsr); hsr_del_nodes(&hsr->node_db); + hsr_del_nodes(&hsr->proxy_node_db); unregister_netdevice_queue(dev, head); } diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 1b6457f357bd..af6cf64a00e0 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -55,6 +55,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && + port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6dd960ec558c..56ef873828f4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -338,7 +338,6 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -351,7 +350,6 @@ static struct ctl_table lowpan_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) @@ -370,10 +368,8 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; @@ -399,7 +395,7 @@ err_alloc: static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fafb123f798b..a7bad18bc8b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1072,6 +1072,7 @@ const struct proto_ops inet_stream_ops = { #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, + .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, @@ -1306,8 +1307,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) int inet_sk_rebuild_header(struct sock *sk) { + struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0d0d725b46ad..11c1519b3699 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -456,7 +456,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) /*unsigned long now; */ struct net *net = dev_net(dev); - rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev)); + rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), + RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { @@ -1002,6 +1003,55 @@ out_of_mem: * User level interface (ioctl) */ +static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, + bool getarp) +{ + struct net_device *dev; + + if (getarp) + dev = dev_get_by_name_rcu(net, r->arp_dev); + else + dev = __dev_get_by_name(net, r->arp_dev); + if (!dev) + return ERR_PTR(-ENODEV); + + /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ + if (!r->arp_ha.sa_family) + r->arp_ha.sa_family = dev->type; + + if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) + return ERR_PTR(-EINVAL); + + return dev; +} + +static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) +{ + struct net_device *dev; + struct rtable *rt; + __be32 ip; + + if (r->arp_dev[0]) + return arp_req_dev_by_name(net, r, false); + + if (r->arp_flags & ATF_PUBL) + return NULL; + + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + + rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); + if (IS_ERR(rt)) + return ERR_CAST(rt); + + dev = rt->dst.dev; + ip_rt_put(rt); + + if (!dev) + return ERR_PTR(-EINVAL); + + return dev; +} + /* * Set (create) an ARP cache entry. */ @@ -1022,11 +1072,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { - __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask && mask != htonl(0xFFFFFFFF)) - return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); @@ -1034,6 +1081,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -ENODEV; } if (mask) { + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; @@ -1042,29 +1091,20 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return arp_req_set_proxy(net, dev, 1); } -static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device *dev) +static int arp_req_set(struct net *net, struct arpreq *r) { - __be32 ip; struct neighbour *neigh; + struct net_device *dev; + __be32 ip; int err; + dev = arp_req_dev(net, r); + if (IS_ERR(dev)) + return PTR_ERR(dev); + if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); - ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (r->arp_flags & ATF_PERM) - r->arp_flags |= ATF_COM; - if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); - - if (IS_ERR(rt)) - return PTR_ERR(rt); - dev = rt->dst.dev; - ip_rt_put(rt); - if (!dev) - return -EINVAL; - } switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: @@ -1086,12 +1126,18 @@ static int arp_req_set(struct net *net, struct arpreq *r, break; } + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; - if (r->arp_flags & ATF_PERM) + + if (r->arp_flags & ATF_PERM) { + r->arp_flags |= ATF_COM; state = NUD_PERMANENT; + } + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | @@ -1115,27 +1161,40 @@ static unsigned int arp_state_to_flags(struct neighbour *neigh) * Get an ARP cache entry. */ -static int arp_req_get(struct arpreq *r, struct net_device *dev) +static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; - int err = -ENXIO; + struct net_device *dev; + + if (!r->arp_dev[0]) + return -ENODEV; + + dev = arp_req_dev_by_name(net, r, true); + if (IS_ERR(dev)) + return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); - if (neigh) { - if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) { - read_lock_bh(&neigh->lock); - memcpy(r->arp_ha.sa_data, neigh->ha, - min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); - r->arp_flags = arp_state_to_flags(neigh); - read_unlock_bh(&neigh->lock); - r->arp_ha.sa_family = dev->type; - strscpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); - err = 0; - } + if (!neigh) + return -ENXIO; + + if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); + return -ENXIO; } - return err; + + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, + min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + + neigh_release(neigh); + + r->arp_ha.sa_family = dev->type; + netdev_copy_name(dev, r->arp_dev); + + return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) @@ -1166,36 +1225,31 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { - __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask == htonl(0xFFFFFFFF)) - return pneigh_delete(&arp_tbl, net, &ip, dev); + if (mask) { + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (mask) - return -EINVAL; + return pneigh_delete(&arp_tbl, net, &ip, dev); + } return arp_req_set_proxy(net, dev, 0); } -static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device *dev) +static int arp_req_delete(struct net *net, struct arpreq *r) { + struct net_device *dev; __be32 ip; + dev = arp_req_dev(net, r); + if (IS_ERR(dev)) + return PTR_ERR(dev); + if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!dev) { - struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); - if (IS_ERR(rt)) - return PTR_ERR(rt); - dev = rt->dst.dev; - ip_rt_put(rt); - if (!dev) - return -EINVAL; - } + return arp_invalidate(dev, ip, true); } @@ -1205,9 +1259,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r, int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { - int err; struct arpreq r; - struct net_device *dev = NULL; + __be32 *netmask; + int err; switch (cmd) { case SIOCDARP: @@ -1230,42 +1284,34 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; + + netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) - ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = - htonl(0xFFFFFFFFUL); - rtnl_lock(); - if (r.arp_dev[0]) { - err = -ENODEV; - dev = __dev_get_by_name(net, r.arp_dev); - if (!dev) - goto out; - - /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ - if (!r.arp_ha.sa_family) - r.arp_ha.sa_family = dev->type; - err = -EINVAL; - if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) - goto out; - } else if (cmd == SIOCGARP) { - err = -ENODEV; - goto out; - } + *netmask = htonl(0xFFFFFFFFUL); + else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) + return -EINVAL; switch (cmd) { case SIOCDARP: - err = arp_req_delete(net, &r, dev); + rtnl_lock(); + err = arp_req_delete(net, &r); + rtnl_unlock(); break; case SIOCSARP: - err = arp_req_set(net, &r, dev); + rtnl_lock(); + err = arp_req_set(net, &r); + rtnl_unlock(); break; case SIOCGARP: - err = arp_req_get(&r, dev); + rcu_read_lock(); + err = arp_req_get(net, &r); + rcu_read_unlock(); + + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; break; } -out: - rtnl_unlock(); - if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) - err = -EFAULT; + return err; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7a437f0d4190..a612c57b61c5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -224,6 +224,7 @@ static struct in_ifaddr *inet_alloc_ifa(void) static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); + if (ifa->ifa_dev) in_dev_put(ifa->ifa_dev); kfree(ifa); @@ -231,7 +232,11 @@ static void inet_rcu_free_ifa(struct rcu_head *head) static void inet_free_ifa(struct in_ifaddr *ifa) { - call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); + /* Our reference to ifa->ifa_dev must be freed ASAP + * to release the reference to the netdev the same way. + * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() + */ + call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) @@ -2515,7 +2520,7 @@ static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; + struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -2578,7 +2583,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, if (!t) goto out; - for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; @@ -2652,7 +2657,6 @@ static struct ctl_table ctl_forward_entry[] = { .extra1 = &ipv4_devconf, .extra2 = &init_net, }, - { }, }; #endif @@ -2749,7 +2753,7 @@ err_alloc_all: static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d33d12421814..3968d3f98e08 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -20,6 +20,7 @@ #include <net/udp.h> #include <net/tcp.h> #include <net/espintcp.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -114,7 +115,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET_ESPINTCP @@ -347,7 +348,6 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, __be16 dport) { struct udphdr *uh; - __be32 *udpdata32; unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); @@ -362,12 +362,6 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, *skb_mac_header(skb) = IPPROTO_UDP; - if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - return (struct ip_esp_hdr *)(udpdata32 + 2); - } - return (struct ip_esp_hdr *)(uh + 1); } @@ -423,7 +417,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); break; case TCP_ENCAP_ESPINTCP: @@ -775,7 +768,6 @@ int esp_input_done2(struct sk_buff *skb, int err) source = th->source; break; case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; break; default: @@ -1179,9 +1171,6 @@ static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); - break; #ifdef CONFIG_INET_ESPINTCP case TCP_ENCAP_ESPINTCP: /* only the length field, TCP encap is done by diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5eb1b8d302bb..f669da98d11d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -254,7 +254,7 @@ void free_fib_info(struct fib_info *fi) return; } - call_rcu(&fi->rcu, free_fib_info_rcu); + call_rcu_hurry(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c index 06e5572f296f..54984f3170a8 100644 --- a/net/ipv4/fou_bpf.c +++ b/net/ipv4/fou_bpf.c @@ -64,7 +64,7 @@ __bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, info->encap.type = TUNNEL_ENCAP_NONE; } - if (info->key.tun_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM; info->encap.sport = encap->sport; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 3757fd93523f..6701a98d9a9f 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -73,7 +73,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; - tpi->flags = gre_flags_to_tnl_flags(greh->flags); + gre_flags_to_tnl_flags(tpi->flags, greh->flags); hdr_len = gre_calc_hlen(tpi->flags); if (!pskb_may_pull(skb, nhs + hdr_len)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 437e782b9663..ab6d0d98dbc3 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -93,6 +93,8 @@ #include <net/ip_fib.h> #include <net/l3mdev.h> #include <net/addrconf.h> +#define CREATE_TRACE_POINTS +#include <trace/events/icmp.h> /* * Build xmit assembly blocks @@ -483,6 +485,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct icmp_bxm *param) { struct net_device *route_lookup_dev; + struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -508,16 +511,17 @@ static struct rtable *icmp_route_lookup(struct net *net, /* No need to clone since we're just using its address. */ rt2 = rt; - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(fl4), NULL, 0); - if (!IS_ERR(rt)) { + dst = xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + rt = dst_rtable(dst); + if (!IS_ERR(dst)) { if (rt != rt2) return rt; - } else if (PTR_ERR(rt) == -EPERM) { + } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; - } else + } else { return rt; - + } err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; @@ -551,19 +555,19 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(&fl4_dec), NULL, - XFRM_LOOKUP_ICMP); - if (!IS_ERR(rt2)) { + dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + rt2 = dst_rtable(dst2); + if (!IS_ERR(dst2)) { dst_release(&rt->dst); memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; - } else if (PTR_ERR(rt2) == -EPERM) { + } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { - err = PTR_ERR(rt2); + err = PTR_ERR(dst2); goto relookup_failed; } return rt; @@ -768,6 +772,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (!fl4.saddr) fl4.saddr = htonl(INADDR_DUMMY); + trace_icmp_send(skb_in, type, code); + icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 717e97a389a8..9bf09de6a2e7 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1842,7 +1842,8 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, - 0, 0, 0); + 0, 0, 0, + RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c88c9034d630..faaec92a46ac 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -175,7 +175,7 @@ static void fqdir_free_fn(struct work_struct *work) } } -static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); +static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn); static void fqdir_work_fn(struct work_struct *work) { @@ -184,7 +184,7 @@ static void fqdir_work_fn(struct work_struct *work) rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) - queue_work(system_wq, &fqdir_free_work); + queue_delayed_work(system_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index cf88eca5f1b4..48d0d494185b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -565,7 +565,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) + if (sk->sk_protocol == IPPROTO_TCP && + tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e8de45d34d56..e28075f0006e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -264,14 +264,18 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo) { + struct inet_ehash_bucket *head = &hashinfo->ehash[0]; + unsigned int ehash_mask = hashinfo->ehash_mask; struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; - for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + for (slot = 0; slot <= ehash_mask; slot++, head++) { + if (hlist_nulls_empty(&head->chain)) + continue; + restart_rcu: cond_resched(); rcu_read_lock(); @@ -283,15 +287,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count)) + if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely(sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count))) { + if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb947d1613fe..08e2c92e25ab 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -580,7 +580,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, - { } }; /* secret interval has been deprecated */ @@ -593,7 +592,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) @@ -632,7 +630,7 @@ err_alloc: static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 57ddcd8c62f6..ba205473522e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -265,6 +265,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; struct erspan_base_hdr *ershdr; + IP_TUNNEL_DECLARE_FLAGS(flags); struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; const struct iphdr *iph; @@ -272,12 +273,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int ver; int len; + ip_tunnel_flags_copy(flags, tpi->flags); + itn = net_generic(net, erspan_net_id); iph = ip_hdr(skb); if (is_erspan_type1(gre_hdr_len)) { ver = 0; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_NO_KEY, + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); } else { if (unlikely(!pskb_may_pull(skb, @@ -287,8 +290,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, + __set_bit(IP_TUNNEL_KEY_BIT, flags); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, tpi->key); } @@ -312,10 +315,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, @@ -338,7 +340,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, ERSPAN_V2_MDSIZE); info = &tun_dst->u.tun_info; - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); } @@ -381,10 +384,13 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, tnl_params = &tunnel->parms.iph; if (tunnel->collect_md || tnl_params->daddr == 0) { - __be16 flags; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; __be64 tun_id; - flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + ip_tunnel_flags_and(flags, tpi->flags, flags); + tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) @@ -464,12 +470,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - __be16 flags = tunnel->parms.o_flags; + IP_TUNNEL_DECLARE_FLAGS(flags); + + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -483,10 +492,10 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tunnel_hlen; - __be16 flags; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -500,14 +509,19 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; /* Push Tunnel header. */ - if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto err_free_skb; - flags = tun_info->key.tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags); + gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -521,6 +535,7 @@ err_free_skb: static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct erspan_metadata *md; @@ -536,7 +551,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto err_free_skb; if (tun_info->options_len < sizeof(*md)) goto err_free_skb; @@ -589,8 +604,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; } - gre_build_header(skb, 8, TUNNEL_SEQ, - proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -664,7 +680,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, tnl_params = &tunnel->parms.iph; } - if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); @@ -706,7 +723,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, /* Push ERSPAN header */ if (tunnel->erspan_ver == 0) { proto = htons(ETH_P_ERSPAN); - tunnel->parms.o_flags &= ~TUNNEL_SEQ; + __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags); } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, @@ -721,7 +738,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, goto free_skb; } - tunnel->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags); __gre_xmit(skb, dev, &tunnel->parms.iph, proto); return NETDEV_TX_OK; @@ -744,7 +761,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } - if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + tunnel->parms.o_flags))) goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) @@ -762,7 +780,6 @@ free_skb: static void ipgre_link_update(struct net_device *dev, bool set_mtu) { struct ip_tunnel *tunnel = netdev_priv(dev); - __be16 flags; int len; len = tunnel->tun_hlen; @@ -776,12 +793,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) dev->needed_headroom += len; if (set_mtu) - dev->mtu = max_t(int, dev->mtu - len, 68); - - flags = tunnel->parms.o_flags; + WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68)); - if (flags & TUNNEL_SEQ || - (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) { + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) || + (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE)) { dev->features &= ~NETIF_F_GSO_SOFTWARE; dev->hw_features &= ~NETIF_F_GSO_SOFTWARE; } else { @@ -790,20 +806,29 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, +static int ipgre_tunnel_ctl(struct net_device *dev, + struct ip_tunnel_parm_kern *p, int cmd) { + __be16 i_flags, o_flags; int err; + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + i_flags = ip_tunnel_flags_to_be16(p->i_flags); + o_flags = ip_tunnel_flags_to_be16(p->o_flags); + if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || - ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) + ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p->i_flags = gre_flags_to_tnl_flags(p->i_flags); - p->o_flags = gre_flags_to_tnl_flags(p->o_flags); + gre_flags_to_tnl_flags(p->i_flags, i_flags); + gre_flags_to_tnl_flags(p->o_flags, o_flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) @@ -812,15 +837,18 @@ static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); - p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + ip_tunnel_flags_from_be16(p->i_flags, i_flags); + o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); + ip_tunnel_flags_from_be16(p->o_flags, o_flags); + return 0; } @@ -960,7 +988,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; - __be16 flags; tunnel = netdev_priv(dev); tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); @@ -972,14 +999,13 @@ static void __gre_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES | NETIF_F_LLTX; dev->hw_features |= GRE_FEATURES; - flags = tunnel->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) && + tunnel->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -1136,7 +1162,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1152,10 +1178,12 @@ static int ipgre_netlink_parms(struct net_device *dev, parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1203,7 +1231,7 @@ static int ipgre_netlink_parms(struct net_device *dev, static int erspan_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -1362,7 +1390,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1380,7 +1408,7 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1399,8 +1427,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1415,8 +1443,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], if (err < 0) return err; - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); ipgre_link_update(dev, !tb[IFLA_MTU]); @@ -1428,8 +1456,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; int err; err = ipgre_newlink_encap_setup(dev, data); @@ -1444,8 +1472,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], if (err < 0) return err; - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags); return 0; } @@ -1501,8 +1529,10 @@ static size_t ipgre_get_size(const struct net_device *dev) static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + struct ip_tunnel_parm_kern *p = &t->parms; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -1550,7 +1580,7 @@ static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (t->erspan_ver <= 2) { if (t->erspan_ver != 0 && !t->collect_md) - t->parms.o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) goto nla_put_failure; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 5e9c8156656a..d6fbcbd2358a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -616,7 +616,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, - ((struct rtable *)dst)->rt_type); + dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 39229fd0601a..9500031a1f55 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; @@ -475,7 +475,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, goto packet_routed; /* Make sure we can route this packet. */ - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { __be32 daddr; @@ -971,7 +971,7 @@ static int __ip_append_data(struct sock *sk, bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; @@ -1390,7 +1390,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)cork->dst; + struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1b8d8ff9a237..bccef2fcf620 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,17 +56,13 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, - __be16 flags, __be32 key) +static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p, + const unsigned long *flags, __be32 key) { - if (p->i_flags & TUNNEL_KEY) { - if (flags & TUNNEL_KEY) - return key == p->i_key; - else - /* key expected, none present */ - return false; - } else - return !(flags & TUNNEL_KEY); + if (!test_bit(IP_TUNNEL_KEY_BIT, flags)) + return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags); + + return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key; } /* Fallback tunnel: no source, no destination, no key, no options @@ -81,7 +77,7 @@ static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, Given src, dst and key, find appropriate for input tunnel. */ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, + int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key) { @@ -143,7 +139,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, } hlist_for_each_entry_rcu(t, head, hash_node) { - if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) || + if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) && + t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) @@ -171,7 +168,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { unsigned int h; __be32 remote; @@ -182,7 +179,8 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, else remote = 0; - if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) + if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) && + test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags)) i_key = 0; h = ip_tunnel_hash(i_key, remote); @@ -206,17 +204,19 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; + IP_TUNNEL_DECLARE_FLAGS(flags); __be32 key = parms->i_key; - __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); + ip_tunnel_flags_copy(flags, parms->i_flags); + hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && @@ -230,7 +230,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { int err; struct ip_tunnel *tunnel; @@ -326,7 +326,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { struct ip_tunnel *nt; struct net_device *dev; @@ -386,15 +386,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, } #endif - if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || - ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags&TUNNEL_SEQ) { - if (!(tpi->flags&TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); @@ -543,7 +543,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rt6_info *rt6; __be32 daddr; - rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) : + rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) : NULL; daddr = md ? dst : tunnel->parms.iph.daddr; @@ -638,7 +638,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (key->tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { @@ -871,7 +871,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, - struct ip_tunnel_parm *p, + struct ip_tunnel_parm_kern *p, bool set_mtu, __u32 fwmark) { @@ -897,13 +897,14 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) - dev->mtu = mtu; + WRITE_ONCE(dev->mtu, mtu); } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } -int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -927,10 +928,10 @@ int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); - if (!(p->i_flags & VTI_ISVTI)) { - if (!(p->i_flags & TUNNEL_KEY)) + if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) { + if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags)) p->i_key = 0; - if (!(p->o_flags & TUNNEL_KEY)) + if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags)) p->o_key = 0; } @@ -1005,16 +1006,58 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); +bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, + const void __user *data) +{ + struct ip_tunnel_parm p; + + if (copy_from_user(&p, data, sizeof(p))) + return false; + + strscpy(kp->name, p.name); + kp->link = p.link; + ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags); + ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags); + kp->i_key = p.i_key; + kp->o_key = p.o_key; + memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph))); + + return true; +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user); + +bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp) +{ + struct ip_tunnel_parm p; + + if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) || + !ip_tunnel_flags_is_be16_compat(kp->o_flags)) + return false; + + memset(&p, 0, sizeof(p)); + + strscpy(p.name, kp->name); + p.link = kp->link; + p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags); + p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags); + p.i_key = kp->i_key; + p.o_key = kp->o_key; + memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph))); + + return !copy_to_user(data, &p, sizeof(p)); +} +EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user); + int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { - struct ip_tunnel_parm p; + struct ip_tunnel_parm_kern p; int err; - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(data, &p, sizeof(p))) + if (!err && !ip_tunnel_parm_to_user(data, &p)) return -EFAULT; return err; } @@ -1039,7 +1082,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) new_mtu = max_mtu; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); @@ -1077,7 +1120,7 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - return tunnel->net; + return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip_tunnel_get_link_net); @@ -1093,7 +1136,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; unsigned int i; itn->rtnl_link_ops = ops; @@ -1171,7 +1214,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); @@ -1225,7 +1268,7 @@ err_register_netdevice: EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p, __u32 fwmark) + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 80ccd6661aa3..a3676155be78 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) { + IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; struct metadata_dst *res; struct ip_tunnel_info *dst, *src; @@ -144,10 +145,10 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, sizeof(struct in6_addr)); else dst->key.u.ipv4.dst = src->key.u.ipv4.src; - dst->key.tun_flags = src->key.tun_flags; + ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags); dst->mode = src->mode | IP_TUNNEL_INFO_TX; ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src), - src->options_len, 0); + src->options_len, tun_flags); return res; } @@ -497,7 +498,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr, opt->opt_class = nla_get_be16(attr); attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]; opt->type = nla_get_u8(attr); - info->key.tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); } return sizeof(struct geneve_opt) + data_len; @@ -525,7 +526,7 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr, attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP]; md->gbp = nla_get_u32(attr); md->gbp &= VXLAN_GBP_MASK; - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct vxlan_metadata); @@ -574,7 +575,7 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr, set_hwid(&md->u.md2, nla_get_u8(attr)); } - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); } return sizeof(struct erspan_metadata); @@ -585,7 +586,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, { int err, rem, opt_len, opts_len = 0; struct nlattr *nla; - __be16 type = 0; + u32 type = 0; if (!attr) return 0; @@ -598,7 +599,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { switch (nla_type(nla)) { case LWTUNNEL_IP_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len, extack); @@ -607,7 +608,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) return -EINVAL; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; case LWTUNNEL_IP_OPTS_VXLAN: if (type) @@ -617,7 +618,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case LWTUNNEL_IP_OPTS_ERSPAN: if (type) @@ -627,7 +628,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; default: return -EINVAL; @@ -705,10 +706,16 @@ static int ip_tun_build_state(struct net *net, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_FLAGS]) - tun_info->key.tun_flags |= - (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) & - ~TUNNEL_OPTIONS_PRESENT); + if (tb[LWTUNNEL_IP_FLAGS]) { + IP_TUNNEL_DECLARE_FLAGS(flags); + + ip_tunnel_flags_from_be16(flags, + nla_get_be16(tb[LWTUNNEL_IP_FLAGS])); + ip_tunnel_clear_options_present(flags); + + ip_tunnel_flags_or(tun_info->key.tun_flags, + tun_info->key.tun_flags, flags); + } tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options_len = opt_len; @@ -812,18 +819,18 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type, struct nlattr *nest; int err = 0; - if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + if (!ip_tunnel_is_options_present(tun_info->key.tun_flags)) return 0; nest = nla_nest_start_noflag(skb, type); if (!nest) return -ENOMEM; - if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_geneve(skb, tun_info); - else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT) + else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_vxlan(skb, tun_info); - else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT) + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) err = ip_tun_fill_encap_opts_erspan(skb, tun_info); if (err) { @@ -846,7 +853,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) || + nla_put_be16(skb, LWTUNNEL_IP_FLAGS, + ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info)) return -ENOMEM; @@ -857,11 +865,11 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) { int opt_len; - if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)) + if (!ip_tunnel_is_options_present(info->key.tun_flags)) return 0; opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */ - if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { struct geneve_opt *opt; int offset = 0; @@ -874,10 +882,10 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info) /* OPT_GENEVE_DATA */ offset += sizeof(*opt) + opt->length * 4; } - } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */ + nla_total_size(4); /* OPT_VXLAN_GBP */ - } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { struct erspan_metadata *md = ip_tunnel_info_opts(info); opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */ @@ -984,10 +992,17 @@ static int ip6_tun_build_state(struct net *net, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_FLAGS]) - tun_info->key.tun_flags |= - (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) & - ~TUNNEL_OPTIONS_PRESENT); + if (tb[LWTUNNEL_IP6_FLAGS]) { + IP_TUNNEL_DECLARE_FLAGS(flags); + __be16 data; + + data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]); + ip_tunnel_flags_from_be16(flags, data); + ip_tunnel_clear_options_present(flags); + + ip_tunnel_flags_or(tun_info->key.tun_flags, + tun_info->key.tun_flags, flags); + } tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options_len = opt_len; @@ -1008,7 +1023,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || - nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) || + nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, + ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) || ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info)) return -ENOMEM; @@ -1116,7 +1132,7 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); void ip_tunnel_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm_kern *parms) { if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); @@ -1139,8 +1155,12 @@ void ip_tunnel_netlink_parms(struct nlattr *data[], if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); - if (data[IFLA_IPTUN_FLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + if (data[IFLA_IPTUN_FLAGS]) { + __be16 flags; + + flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + ip_tunnel_flags_from_be16(parms->i_flags, flags); + } if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index ee587adb169f..14536da9f5dc 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -51,8 +51,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -167,7 +170,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parms = &tunnel->parms; + struct ip_tunnel_parm_kern *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; @@ -322,8 +325,11 @@ static int vti4_err(struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; @@ -373,8 +379,9 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { @@ -383,20 +390,26 @@ vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) return -EINVAL; } - if (!(p->i_flags & GRE_KEY)) + if (!ip_tunnel_flags_is_be16_compat(p->i_flags) || + !ip_tunnel_flags_is_be16_compat(p->o_flags)) + return -EOVERFLOW; + + if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY)) p->i_key = 0; - if (!(p->o_flags & GRE_KEY)) + if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY)) p->o_key = 0; - p->i_flags = VTI_ISVTI; + __set_bit(IP_TUNNEL_VTI_BIT, flags); + ip_tunnel_flags_copy(p->i_flags, flags); err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p->i_flags |= GRE_KEY; - p->o_flags |= GRE_KEY; + ip_tunnel_flags_from_be16(flags, GRE_KEY); + ip_tunnel_flags_or(p->i_flags, p->i_flags, flags); + ip_tunnel_flags_or(p->o_flags, p->o_flags, flags); } return 0; } @@ -531,7 +544,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void vti_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -541,7 +554,7 @@ static void vti_netlink_parms(struct nlattr *data[], if (!data) return; - parms->i_flags = VTI_ISVTI; + __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags); if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); @@ -566,7 +579,7 @@ static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip_tunnel_parm parms; + struct ip_tunnel_parm_kern parms; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); @@ -578,8 +591,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; __u32 fwmark = t->fwmark; - struct ip_tunnel_parm p; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); @@ -606,7 +619,7 @@ static size_t vti_get_size(const struct net_device *dev) static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; + struct ip_tunnel_parm_kern *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index f2696eaadbe6..923a2ef68c2f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -130,13 +130,16 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); const struct iphdr *iph = (const struct iphdr *)skb->data; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; int err = 0; - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + + t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr, + iph->saddr, 0); if (!t) { err = -ENOENT; goto out; @@ -213,13 +216,16 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst = NULL; struct ip_tunnel *tunnel; const struct iphdr *iph; + __set_bit(IP_TUNNEL_NO_KEY_BIT, flags); + iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr, + iph->daddr, 0); if (tunnel) { const struct tnl_ptk_info *tpi; @@ -238,7 +244,9 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto) if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (tunnel->collect_md) { - tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); + ip_tunnel_flags_zero(flags); + + tun_dst = ip_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) return 0; ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); @@ -330,7 +338,7 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || @@ -340,7 +348,8 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) } p->i_key = p->o_key = 0; - p->i_flags = p->o_flags = 0; + ip_tunnel_flags_zero(p->i_flags); + ip_tunnel_flags_zero(p->o_flags); return ip_tunnel_ctl(dev, p, cmd); } @@ -405,8 +414,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, bool *collect_md, - __u32 *fwmark) + struct ip_tunnel_parm_kern *parms, + bool *collect_md, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -432,8 +441,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; __u32 fwmark = 0; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { @@ -452,8 +461,8 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; bool collect_md; __u32 fwmark = t->fwmark; @@ -510,7 +519,7 @@ static size_t ipip_get_size(const struct net_device *dev) static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fd5c01c8489f..6c750bd13dd8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -441,7 +441,7 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { struct net_device *tunnel_dev, *new_dev; - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; tunnel_dev = __dev_get_by_name(net, "tunl0"); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 914bc9c35cc7..6c4664c681ca 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -33,6 +33,7 @@ #include <net/protocol.h> #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b814fdab19f7..5fd54103174f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -106,9 +106,6 @@ #include "fib_lookup.h" -#define RT_FL_TOS(oldflp4) \ - ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) - #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) @@ -498,15 +495,6 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void ip_rt_fix_tos(struct flowi4 *fl4) -{ - __u8 tos = RT_FL_TOS(fl4); - - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - if (tos & RTO_ONLINK) - fl4->flowi4_scope = RT_SCOPE_LINK; -} - static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, @@ -831,7 +819,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf u32 mark = skb->mark; __u8 tos = iph->tos; - rt = (struct rtable *) dst; + rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); @@ -839,7 +827,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; + struct rtable *rt = dst_rtable(dst); struct dst_entry *ret = dst; if (rt) { @@ -1056,7 +1044,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); @@ -1127,7 +1115,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); - rt = (struct rtable *)odst; + rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) @@ -1136,7 +1124,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1193,7 +1181,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -1528,10 +1516,8 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - ip_dst_metrics_put(dst); - rt_del_uncached_list(rt); + rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) @@ -2639,7 +2625,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - ip_rt_fix_tos(fl4); + fl4->flowi4_tos &= IPTOS_RT_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -2832,7 +2818,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); @@ -2877,9 +2863,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; - rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0)); } return rt; @@ -3510,7 +3496,6 @@ static struct ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static const char ipv4_route_flush_procname[] = "flush"; @@ -3544,7 +3529,6 @@ static struct ctl_table ipv4_route_netns_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; static __net_init int sysctl_route_net_init(struct net *net) @@ -3562,16 +3546,14 @@ static __net_init int sysctl_route_net_init(struct net *net) /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { - if (tbl[0].procname != ipv4_route_flush_procname) { - tbl[0].procname = NULL; + if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; - } } /* Update the variables to point into the current struct net * except for the first element flush */ - for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++) + for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; @@ -3591,7 +3573,7 @@ err_dup: static __net_exit void sysctl_route_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 500f665f98cb..b61d36810fe3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -462,7 +462,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } /* Try to redo what tcp_v4_send_synack did. */ - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : + dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e4f16a7dcc1..162a0a3b6ba5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -575,7 +575,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, - { } }; static struct ctl_table ipv4_net_table[] = { @@ -1502,11 +1501,11 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, - { } }; static __net_init int ipv4_sysctl_init_net(struct net *net) { + size_t table_size = ARRAY_SIZE(ipv4_net_table); struct ctl_table *table; table = ipv4_net_table; @@ -1517,7 +1516,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { + for (i = 0; i < table_size; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net @@ -1533,7 +1532,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, - ARRAY_SIZE(ipv4_net_table)); + table_size); if (!net->ipv4.ipv4_hdr) goto err_reg; @@ -1554,7 +1553,7 @@ err_alloc: static __net_exit void ipv4_sysctl_exit_net(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 66d77faca64f..06aab937d60a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,13 +272,16 @@ #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> +#include <net/rstreason.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> +#include <net/hotdata.h> #include <net/rps.h> /* Track pending CMSGs. */ @@ -290,6 +293,9 @@ enum { DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(u32, tcp_tw_isn); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); + long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); @@ -1184,7 +1190,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= READ_ONCE(sysctl_max_skb_frags)) { + if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1416,8 +1422,6 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) struct sk_buff *skb; int copied = 0, err = 0; - /* XXX -- need to support SO_PEEK_OFF */ - skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -1721,7 +1725,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); - tcp_sk(sk)->window_clamp = val; + WRITE_ONCE(tcp_sk(sk)->window_clamp, val); } return 0; } @@ -2328,6 +2332,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; + u32 peek_offset = 0; u32 urg_hole = 0; err = -ENOTCONN; @@ -2361,7 +2366,8 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, seq = &tp->copied_seq; if (flags & MSG_PEEK) { - peek_seq = tp->copied_seq; + peek_offset = max(sk_peek_offset(sk, flags), 0); + peek_seq = tp->copied_seq + peek_offset; seq = &peek_seq; } @@ -2464,11 +2470,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, } if ((flags & MSG_PEEK) && - (peek_seq - copied - urg_hole != tp->copied_seq)) { + (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); - peek_seq = tp->copied_seq; + peek_seq = tp->copied_seq + peek_offset; } continue; @@ -2509,7 +2515,10 @@ found_ok_skb: WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; - + if (flags & MSG_PEEK) + sk_peek_offset_fwd(sk, used); + else + sk_peek_offset_bwd(sk, used); tcp_rcv_space_adjust(sk); skip_copy: @@ -2744,7 +2753,15 @@ static bool tcp_too_many_orphans(int shift) READ_ONCE(sysctl_tcp_max_orphans); } -bool tcp_check_oom(struct sock *sk, int shift) +static bool tcp_out_of_memory(const struct sock *sk) +{ + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) + return true; + return false; +} + +bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; @@ -2805,7 +2822,8 @@ void __tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, sk->sk_allocation); + tcp_send_active_reset(sk, sk->sk_allocation, + SK_RST_REASON_NOT_SPECIFIED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2879,7 +2897,8 @@ adjudge_to_death: struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2897,7 +2916,8 @@ adjudge_to_death: if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -3001,7 +3021,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any()); + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); @@ -3010,6 +3030,7 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); + sk_set_peek_off(sk, -1); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); @@ -3379,7 +3400,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; - tp->window_clamp = 0; + WRITE_ONCE(tp->window_clamp, 0); } else { u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? @@ -3388,7 +3409,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) if (new_window_clamp == old_window_clamp) return 0; - tp->window_clamp = new_window_clamp; + WRITE_ONCE(tp->window_clamp, new_window_clamp); if (new_window_clamp < old_window_clamp) { /* need to apply the reserved mem provisioning only * when shrinking the window clamp @@ -4057,7 +4078,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: - val = tp->window_clamp; + val = READ_ONCE(tp->window_clamp); break; case TCP_INFO: { struct tcp_info info; @@ -4557,7 +4578,8 @@ int tcp_abort(struct sock *sk, int err) smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); } @@ -4648,16 +4670,16 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 105); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); @@ -4670,7 +4692,11 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); + + /* 32bit arches with 8byte alignment on u64 fields might need padding + * before tcp_clock_cache. + */ + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 05dc2d05bc7c..7e52ab24e40a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1156,8 +1156,6 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { }; BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -1166,8 +1164,6 @@ BTF_ID_FLAGS(func, bbr_cwnd_event) BTF_ID_FLAGS(func, bbr_ssthresh) BTF_ID_FLAGS(func, bbr_min_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) -#endif -#endif BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 44869ea089e3..5dbed91c6178 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -486,16 +486,12 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { }; BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, cubictcp_init) BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh) BTF_ID_FLAGS(func, cubictcp_cong_avoid) BTF_ID_FLAGS(func, cubictcp_state) BTF_ID_FLAGS(func, cubictcp_cwnd_event) BTF_ID_FLAGS(func, cubictcp_acked) -#endif -#endif BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index e33fbe4933e4..6b712a33d49f 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -261,16 +261,12 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) -#endif -#endif BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a140d9f7a0a3..4ccfec4bb624 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include <linux/prefetch.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> @@ -563,19 +564,20 @@ static void tcp_init_buffer_space(struct sock *sk) maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { - tp->window_clamp = maxwin; + WRITE_ONCE(tp->window_clamp, maxwin); if (tcp_app_win && maxwin > 4 * tp->advmss) - tp->window_clamp = max(maxwin - - (maxwin >> tcp_app_win), - 4 * tp->advmss); + WRITE_ONCE(tp->window_clamp, + max(maxwin - (maxwin >> tcp_app_win), + 4 * tp->advmss)); } /* Force reservation of one segment. */ if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) - tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + WRITE_ONCE(tp->window_clamp, + max(2 * tp->advmss, maxwin - tp->advmss)); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; @@ -773,7 +775,8 @@ void tcp_rcv_space_adjust(struct sock *sk) WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ - tp->window_clamp = tcp_win_from_space(sk, rcvbuf); + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); } } tp->rcvq_space.space = copied; @@ -911,7 +914,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ @@ -921,7 +924,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; - tcp_bpf_rtt(sk); + tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } @@ -4803,10 +4806,8 @@ static bool tcp_try_coalesce(struct sock *sk, if (!mptcp_skb_can_collapse(to, from)) return false; -#ifdef CONFIG_TLS_DEVICE - if (from->decrypted != to->decrypted) + if (skb_cmp_decrypted(from, to)) return false; -#endif if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -5174,6 +5175,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { + /* Some stacks are known to send bare FIN packets + * in a loop even if we send RWIN 0 in our ACK. + * Accepting this FIN does not hurt memory pressure + * because the FIN flag will simply be merged to the + * receive queue tail skb in most cases. + */ + if (!skb->len && + (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + goto queue_and_out; + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; @@ -5188,7 +5199,7 @@ queue_and_out: inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - if (skb_queue_len(&sk->sk_receive_queue)) { + if (skb_queue_len(&sk->sk_receive_queue) && skb->len) { reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; @@ -5375,9 +5386,7 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); -#ifdef CONFIG_TLS_DEVICE - nskb->decrypted = skb->decrypted; -#endif + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -5407,10 +5416,8 @@ restart: !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; -#ifdef CONFIG_TLS_DEVICE - if (skb->decrypted != nskb->decrypted) + if (skb_cmp_decrypted(skb, nskb)) goto end; -#endif } } } @@ -6426,7 +6433,8 @@ consume: if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; - tp->window_clamp = min(tp->window_clamp, 65535U); + WRITE_ONCE(tp->window_clamp, + min(tp->window_clamp, 65535U)); } if (tp->rx_opt.saw_tstamp) { @@ -7001,7 +7009,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) +static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; @@ -7102,7 +7110,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct sock *sk, struct sk_buff *skb) { struct tcp_fastopen_cookie foc = { .len = -1 }; - __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -7112,21 +7119,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct dst_entry *dst; struct flowi fl; u8 syncookies; + u32 isn; #ifdef CONFIG_TCP_AO const struct tcp_ao_hdr *aoh; #endif - syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); + isn = __this_cpu_read(tcp_tw_isn); + if (isn) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + __this_cpu_write(tcp_tw_isn, 0); + } else { + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); - /* TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); - if (!want_cookie) - goto drop; + if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) { + want_cookie = tcp_syn_flood_action(sk, + rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } } if (sk_acceptq_is_full(sk)) { @@ -7165,7 +7179,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); - dst = af_ops->route_req(sk, skb, &fl, req); + dst = af_ops->route_req(sk, skb, &fl, req, isn); if (!dst) goto drop_and_free; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e0cef75f85fb..108a438dc247 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -70,6 +70,7 @@ #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/busy_poll.h> +#include <net/rstreason.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -729,7 +730,8 @@ out: * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct { @@ -872,11 +874,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * routing might fail in this case. No choice here, if we choose to force * input interface, we will misroute in case of asymmetric route. */ - if (sk) { + if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - if (sk_fullsock(sk)) - trace_tcp_send_reset(sk, skb); - } + + trace_tcp_send_reset(sk, skb, reason); BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); @@ -1673,7 +1674,8 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { tcp_v4_init_req(req, sk, skb); @@ -1940,7 +1942,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - tcp_v4_send_reset(rsk, skb); + tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: kfree_skb_reason(skb, reason); /* Be careful here. If this function gets more complicated and @@ -2001,7 +2003,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { - u32 limit, tail_gso_size, tail_gso_segs; + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -2010,6 +2012,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, bool fragstolen; u32 gso_segs; u32 gso_size; + u64 limit; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -2051,10 +2054,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || -#ifdef CONFIG_TLS_DEVICE - tail->decrypted != skb->decrypted || -#endif !mptcp_skb_can_collapse(tail, skb) || + skb_cmp_decrypted(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; @@ -2107,7 +2108,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, __skb_push(skb, hdrlen); no_coalesce: - limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); + /* sk->sk_backlog.len is reset only at the end of __release_sock(). + * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach + * sk_rcvbuf in normal conditions. + */ + limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1; + + limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. @@ -2115,6 +2122,8 @@ no_coalesce: */ limit += 64 * 1024; + limit = min_t(u64, limit, UINT_MAX); + if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); *reason = SKB_DROP_REASON_SOCKET_BACKLOG; @@ -2154,7 +2163,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2176,6 +2184,7 @@ int tcp_v4_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2213,7 +2222,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -2285,7 +2293,10 @@ process: } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { - tcp_v4_send_reset(nsk, skb); + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v4_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); @@ -2293,6 +2304,7 @@ process: } } +process: if (static_branch_unlikely(&ip4_min_ttl)) { /* min_ttl can be changed concurrently from do_ip_setsockopt() */ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { @@ -2363,7 +2375,7 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v4_send_reset(NULL, skb); + tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: @@ -2391,7 +2403,7 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2405,6 +2417,7 @@ do_time_wait: sk = sk2; tcp_v4_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } @@ -2414,7 +2427,7 @@ do_time_wait: tcp_v4_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v4_send_reset(sk, skb); + tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason)); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS:; @@ -2424,7 +2437,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_unique = tcp_twsk_unique, .twsk_destructor= tcp_twsk_destructor, }; @@ -3507,7 +3519,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - tcp_twsk_purge(net_exit_list, AF_INET); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c2a925538542..e93df98de3f4 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -766,6 +766,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; + int res = 0; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; @@ -778,7 +779,8 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, continue; if (col < s_col) continue; - if (tcp_metrics_dump_info(skb, cb, tm) < 0) { + res = tcp_metrics_dump_info(skb, cb, tm); + if (res < 0) { rcu_read_unlock(); goto done; } @@ -789,7 +791,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, done: cb->args[0] = row; cb->args[1] = col; - return skb->len; + return res; } static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, @@ -986,6 +988,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .maxattr = TCP_METRICS_ATTR_MAX, .policy = tcp_metrics_nl_policy, .netnsok = true, + .parallel_ops = true, .module = THIS_MODULE, .small_ops = tcp_metrics_nl_ops, .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f0761f060a83..7d543569a180 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -22,6 +22,7 @@ #include <net/tcp.h> #include <net/xfrm.h> #include <net/busy_poll.h> +#include <net/rstreason.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -95,7 +96,7 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th) + const struct tcphdr *th, u32 *tw_isn) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -228,7 +229,7 @@ kill: u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; - TCP_SKB_CB(skb)->tcp_tw_isn = isn; + *tw_isn = isn; return TCP_TW_SYN; } @@ -388,7 +389,7 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); -void tcp_twsk_purge(struct list_head *net_exit_list, int family) +void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; @@ -396,14 +397,13 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ - inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { - inet_twsk_purge(&tcp_hashinfo, family); + inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } -EXPORT_SYMBOL_GPL(tcp_twsk_purge); /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. @@ -879,7 +879,7 @@ embryonic_reset: * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ - req->rsk_ops->send_reset(sk, skb); + req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index ebe4722bb020..c90704befd7b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -28,6 +28,70 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, } } +static void __tcpv4_gso_segment_csum(struct sk_buff *seg, + __be32 *oldip, __be32 newip, + __be16 *oldport, __be16 newport) +{ + struct tcphdr *th; + struct iphdr *iph; + + if (*oldip == newip && *oldport == newport) + return; + + th = tcp_hdr(seg); + iph = ip_hdr(seg); + + inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true); + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); + *oldport = newport; + + csum_replace4(&iph->check, *oldip, newip); + *oldip = newip; +} + +static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct tcphdr *th; + const struct iphdr *iph; + struct sk_buff *seg; + struct tcphdr *th2; + struct iphdr *iph2; + + seg = segs; + th = tcp_hdr(seg); + iph = ip_hdr(seg); + th2 = tcp_hdr(seg->next); + iph2 = ip_hdr(seg->next); + + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && + iph->daddr == iph2->daddr && iph->saddr == iph2->saddr) + return segs; + + while ((seg = seg->next)) { + th2 = tcp_hdr(seg); + iph2 = ip_hdr(seg); + + __tcpv4_gso_segment_csum(seg, + &iph2->saddr, iph->saddr, + &th2->source, th->source); + __tcpv4_gso_segment_csum(seg, + &iph2->daddr, iph->daddr, + &th2->dest, th->dest); + } + + return segs; +} + +static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + return __tcpv4_gso_segment_list_csum(skb); +} + static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -37,6 +101,9 @@ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) + return __tcp4_gso_segment_list(skb, features); + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); @@ -178,61 +245,76 @@ out: return segs; } -struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) +struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) { - struct sk_buff *pp = NULL; + struct tcphdr *th2; struct sk_buff *p; + + list_for_each_entry(p, head, list) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + return p; + } + + return NULL; +} + +struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) +{ + unsigned int thlen, hlen, off; struct tcphdr *th; - struct tcphdr *th2; - unsigned int len; - unsigned int thlen; - __be32 flags; - unsigned int mss = 1; - unsigned int hlen; - unsigned int off; - int flush = 1; - int i; off = skb_gro_offset(skb); hlen = off + sizeof(*th); th = skb_gro_header(skb, hlen, off); if (unlikely(!th)) - goto out; + return NULL; thlen = th->doff * 4; if (thlen < sizeof(*th)) - goto out; + return NULL; hlen = off + thlen; if (!skb_gro_may_pull(skb, hlen)) { th = skb_gro_header_slow(skb, hlen, off); if (unlikely(!th)) - goto out; + return NULL; } skb_gro_pull(skb, thlen); - len = skb_gro_len(skb); - flags = tcp_flag_word(th); - - list_for_each_entry(p, head, list) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; + return th; +} - th2 = tcp_hdr(p); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ + unsigned int thlen = th->doff * 4; + struct sk_buff *pp = NULL; + struct sk_buff *p; + struct tcphdr *th2; + unsigned int len; + __be32 flags; + unsigned int mss = 1; + int flush = 1; + int i; - if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } + len = skb_gro_len(skb); + flags = tcp_flag_word(th); - goto found; - } - p = NULL; - goto out_check_final; + p = tcp_gro_lookup(head, th); + if (!p) + goto out_check_final; -found: /* Include the IP ID check below from the inner most IP hdr */ + th2 = tcp_hdr(p); flush = NAPI_GRO_CB(p)->flush; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & @@ -265,9 +347,19 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); -#ifdef CONFIG_TLS_DEVICE - flush |= p->decrypted ^ skb->decrypted; -#endif + flush |= skb_cmp_decrypted(p, skb); + + if (unlikely(NAPI_GRO_CB(p)->is_flist)) { + flush |= (__force int)(flags ^ tcp_flag_word(th2)); + flush |= skb->ip_summed != p->ip_summed; + flush |= skb->csum_level != p->csum_level; + flush |= NAPI_GRO_CB(p)->count >= 64; + + if (flush || skb_gro_receive_list(p, skb)) + mss = 1; + + goto out_check_final; + } if (flush || skb_gro_receive(p, skb)) { mss = 1; @@ -290,7 +382,6 @@ out_check_final: if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = p; -out: NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; @@ -316,18 +407,58 @@ void tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); +static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ + const struct iphdr *iph; + struct sk_buff *p; + struct sock *sk; + struct net *net; + int iif, sdif; + + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) + return; + + p = tcp_gro_lookup(head, th); + if (p) { + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; + return; + } + + inet_get_iif_sdif(skb, &iif, &sdif); + iph = skb_gro_network_header(skb); + net = dev_net(skb->dev); + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, + iph->saddr, th->source, + iph->daddr, ntohs(th->dest), + iif, sdif); + NAPI_GRO_CB(skb)->is_flist = !sk; + if (sk) + sock_put(sk); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - inet_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + inet_gro_compute_pseudo)) + goto flush; + + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; - return tcp_gro_receive(head, skb); + tcp4_check_fraglist_gro(head, skb, th); + + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) @@ -335,6 +466,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + __skb_incr_checksum_unnecessary(skb); + + return 0; + } + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 02caeb7bcf63..95caf8aaa8be 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,11 +39,13 @@ #include <net/tcp.h> #include <net/mptcp.h> +#include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> +#include <linux/skbuff_ref.h> #include <trace/events/tcp.h> @@ -203,16 +205,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, - __u32 *rcv_wnd, __u32 *window_clamp, + __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); + u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ - if (*window_clamp == 0) - (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); - space = min(*window_clamp, space); + if (window_clamp == 0) + window_clamp = (U16_MAX << TCP_MAX_WSCALE); + space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) @@ -239,12 +242,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); - space = min_t(u32, space, *window_clamp); + space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); + WRITE_ONCE(*__window_clamp, + min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -1499,18 +1503,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) } /* Initialize TSO segments for a packet. */ -static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { + int tso_segs; + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ - tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->tcp_gso_size = 0; - } else { - tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); - TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tcp_skb_pcount_set(skb, 1); + return 1; } + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + tso_segs = DIV_ROUND_UP(skb->len, mss_now); + tcp_skb_pcount_set(skb, tso_segs); + return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various @@ -2070,16 +2078,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, - const struct sk_buff *skb) +static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; - /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && - tcp_skb_pcount(skb) == 1) - return 1; - in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) @@ -2100,10 +2102,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { - tcp_set_skb_tso_segs(skb, mss_now); - tso_segs = tcp_skb_pcount(skb); - } + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) + return tcp_set_skb_tso_segs(skb, mss_now); + return tso_segs; } @@ -2403,6 +2404,21 @@ commit: return 0; } +/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if + * all its payload was moved to another one (dst). + * Make sure to transfer tcp_flags, eor, and tstamp. + */ +static void tcp_eat_one_skb(struct sock *sk, + struct sk_buff *dst, + struct sk_buff *src) +{ + TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; + TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; + tcp_skb_collapse_tstamp(dst, src); + tcp_unlink_write_queue(src, sk); + tcp_wmem_free_skb(sk, src); +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -2508,16 +2524,7 @@ static int tcp_mtu_probe(struct sock *sk) copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { - /* We've eaten all the data from this skb. - * Throw it away. */ - TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; - /* If this is the last SKB we copy and eor is set - * we need to propagate it to the new skb. - */ - TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; - tcp_skb_collapse_tstamp(nskb, skb); - tcp_unlink_write_queue(skb, sk); - tcp_wmem_free_skb(sk, skb); + tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); @@ -2683,6 +2690,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } +/* First skb in the write queue is smaller than ideal packet size. + * Check if we can move payload from the second skb in the queue. + */ +static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) +{ + struct sk_buff *next_skb = skb->next; + unsigned int nlen; + + if (tcp_skb_is_last(sk, skb)) + return; + + if (!tcp_skb_can_collapse(skb, next_skb)) + return; + + nlen = min_t(u32, amount, next_skb->len); + if (!nlen || !skb_shift(skb, next_skb, nlen)) + return; + + TCP_SKB_CB(skb)->end_seq += nlen; + TCP_SKB_CB(next_skb)->seq += nlen; + + if (!next_skb->len) { + /* In case FIN is set, we need to update end_seq */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + tcp_eat_one_skb(sk, skb, next_skb); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -2703,10 +2739,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; - int cwnd_quota; + u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; - u32 max_segs; sent_pkts = 0; @@ -2724,6 +2759,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; + int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ @@ -2737,10 +2773,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (tcp_pacing_check(sk)) break; - tso_segs = tcp_init_tso_segs(skb, mss_now); - BUG_ON(!tso_segs); - - cwnd_quota = tcp_cwnd_test(tp, skb); + cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ @@ -2748,6 +2781,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else break; } + cwnd_quota = min(cwnd_quota, max_segs); + missing_bytes = cwnd_quota * mss_now - skb->len; + if (missing_bytes > 0) + tcp_grow_skb(sk, skb, missing_bytes); + + tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; @@ -2769,9 +2808,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - max_segs), + cwnd_quota, nonagle); if (skb->len > limit && @@ -3387,11 +3424,6 @@ start: err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } - /* To avoid taking spuriously low RTT samples based on a timestamp - * for a transmit that never happened, always mark EVER_RETRANS - */ - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; - if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); @@ -3401,6 +3433,12 @@ start: } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } + + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + return err; } @@ -3585,7 +3623,8 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ -void tcp_send_active_reset(struct sock *sk, gfp_t priority) +void tcp_send_active_reset(struct sock *sk, gfp_t priority, + enum sk_rst_reason reason) { struct sk_buff *skb; @@ -3610,7 +3649,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ - trace_tcp_send_reset(sk, NULL); + trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); } /* Send a crossed SYN-ACK during socket establishment. @@ -3857,7 +3896,7 @@ static void tcp_connect_init(struct sock *sk) tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) - tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); @@ -3865,7 +3904,7 @@ static void tcp_connect_init(struct sock *sk) /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) - tp->window_clamp = tcp_full_space(sk); + WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d1ad20ce1c8c..83fe7f62f7f1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,10 +22,11 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> +#include <net/rstreason.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); u32 elapsed, user_timeout; s32 remaining; @@ -47,7 +48,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 remaining, user_timeout; s32 elapsed; @@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; @@ -768,7 +770,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } } - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); goto death; } @@ -795,7 +797,8 @@ static void tcp_keepalive_timer (struct timer_list *t) icsk->icsk_probes_out > 0) || (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { - tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_NOT_SPECIFIED); tcp_write_err(sk); goto out; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b32cf2eeeb41..189c9113fe9a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -427,15 +427,21 @@ static struct sock *udp4_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -456,9 +462,14 @@ static struct sock *udp4_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(result, net, saddr, sport, - daddr, hnum, dif, sdif); - + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; @@ -1207,7 +1218,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (connected) - rt = (struct rtable *)sk_dst_check(sk, 0); + rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); @@ -1501,13 +1512,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - int size; + bool becomes_readable; + int size, rcvbuf; - /* try to avoid the costly atomic add/sub pair when the receive - * queue is full; always allow at least a packet + /* Immediately drop when the receive queue is full. + * Always allow at least one packet. */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem > sk->sk_rcvbuf) + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + if (rmem > rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() @@ -1516,7 +1529,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ - if (rmem > (sk->sk_rcvbuf >> 1)) { + if (rmem > (rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); @@ -1524,12 +1537,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) size = skb->truesize; udp_set_dev_scratch(skb); - /* we drop only if the receive buf is full and the receive - * queue contains some other skb - */ - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) - goto uncharge_drop; + atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); @@ -1545,12 +1553,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); + becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); - + if (!sock_flag(sk, SOCK_DEAD)) { + if (becomes_readable || + sk->sk_data_ready != sock_def_readable || + READ_ONCE(sk->sk_peek_off) >= 0) + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + else + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); + } busylock_release(busy); return 0; @@ -2058,8 +2073,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } @@ -2697,8 +2712,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); - fallthrough; - case UDP_ENCAP_ESPINUDP_NON_IKE: #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8721fe5beca2..f6d3c442e260 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -433,33 +433,6 @@ out: return segs; } -static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) -{ - if (unlikely(p->len + skb->len >= 65536)) - return -E2BIG; - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - - skb_pull(skb, skb_gro_offset(skb)); - - NAPI_GRO_CB(p)->last = skb; - NAPI_GRO_CB(p)->count++; - p->data_len += skb->len; - - /* sk ownership - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - skb->sk = NULL; - p->truesize += skb->truesize; - p->len += skb->len; - - NAPI_GRO_CB(skb)->same_flow = 1; - - return 0; -} - #define UDP_GRO_CNT_MAX 64 static struct sk_buff *udp_gro_receive_segment(struct list_head *head, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 860aff5f8599..e4e0fa869fa4 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -183,7 +183,8 @@ void udp_tunnel_sock_release(struct socket *sock) EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, int md_size) + const unsigned long *flags, + __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; @@ -199,7 +200,7 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) - info->key.tun_flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 86382e08140e..a620618cc568 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -117,19 +117,6 @@ static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull /* Must be an IKE packet.. pass it through */ return 1; break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - /* Check if this is a keepalive packet. If so, eat it. */ - if (len == 1 && udpdata[0] == 0xff) { - return -EINVAL; - } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && - udpdata32[0] == 0 && udpdata32[1] == 0) { - - /* ESP Packet with Non-IKE marker */ - len = sizeof(struct udphdr) + 2 * sizeof(u32); - } else - /* Must be an IKE packet.. pass it through */ - return 1; - break; } /* At this point we are sure that this is an ESPinUDP packet, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c33bca2c3841..0294fef577fa 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -69,7 +69,7 @@ static int xfrm4_get_saddr(struct net *net, int oif, static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rtable *rt = (struct rtable *)xdst->route; + struct rtable *rt = dst_rtable(xdst->route); const struct flowi4 *fl4 = &fl->u.ip4; xdst->u.rt.rt_iif = fl4->flowi4_iif; @@ -152,7 +152,6 @@ static struct ctl_table xfrm4_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __net_init int xfrm4_net_sysctl_init(struct net *net) @@ -186,7 +185,7 @@ err_alloc: static __net_exit void xfrm4_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv4.xfrm4_hdr) return; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 779aa6ecdd49..5c424a0e7232 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -63,6 +63,7 @@ #include <linux/string.h> #include <linux/hash.h> +#include <net/ip_tunnels.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/snmp.h> @@ -2918,7 +2919,7 @@ put: static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, struct in6_ifreq *ireq) { - struct ip_tunnel_parm p = { }; + struct ip_tunnel_parm_kern p = { }; int err; if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) @@ -7183,14 +7184,12 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - /* sentinel */ - } }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { + size_t table_size = ARRAY_SIZE(addrconf_sysctl); int i, ifindex; struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; @@ -7199,7 +7198,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!table) goto out; - for (i = 0; table[i].data; i++) { + for (i = 0; i < table_size; i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; /* If one of these is already set, then it is not safe to * overwrite either of them: this makes proc_dointvec_minmax @@ -7214,7 +7213,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); p->sysctl_header = register_net_sysctl_sz(net, path, table, - ARRAY_SIZE(addrconf_sysctl)); + table_size); if (!p->sysctl_header) goto free; @@ -7237,7 +7236,7 @@ out: static void __addrconf_sysctl_unregister(struct net *net, struct ipv6_devconf *p, int ifindex) { - struct ctl_table *table; + const struct ctl_table *table; if (!p->sysctl_header) return; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 17ac45aa7194..acd70b5992a7 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - net->ipv6.ip6addrlbl_table.seq++; + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq + 1); return ret; } @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, }; static int ip6addrlbl_fill(struct sk_buff *skb, - struct ip6addrlbl_entry *p, + const struct ip6addrlbl_entry *p, u32 lseq, u32 portid, u32 seq, int event, unsigned int flags) @@ -498,7 +499,8 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; - int err; + int err = 0; + u32 lseq; if (cb->strict_check) { err = ip6addrlbl_valid_dump_req(nlh, cb->extack); @@ -507,10 +509,11 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_lock(); + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - net->ipv6.ip6addrlbl_table.seq, + lseq, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -522,7 +525,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline int ip6addrlbl_msgsize(void) @@ -614,7 +617,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - lseq = net->ipv6.ip6addrlbl_table.seq; + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); if (p) err = ip6addrlbl_fill(skb, p, lseq, NETLINK_CB(in_skb).portid, @@ -647,6 +650,7 @@ int __init ipv6_addr_label_rtnl_register(void) return ret; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); return ret; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0f2506e35359..0627c4c18d1a 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -252,9 +252,8 @@ static void aca_free_rcu(struct rcu_head *h) static void aca_put(struct ifacaddr6 *ac) { - if (refcount_dec_and_test(&ac->aca_refcnt)) { - call_rcu(&ac->rcu, aca_free_rcu); - } + if (refcount_dec_and_test(&ac->aca_refcnt)) + call_rcu_hurry(&ac->rcu, aca_free_rcu); } static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7371886d4f9f..34a9a5b9ed00 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include <net/tcp.h> #include <net/espintcp.h> #include <net/inet6_hashtables.h> +#include <linux/skbuff_ref.h> #include <linux/highmem.h> @@ -131,7 +132,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(sg_page(sg), skb->pp_recycle); } #ifdef CONFIG_INET6_ESPINTCP @@ -383,7 +384,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, __be16 dport) { struct udphdr *uh; - __be32 *udpdata32; unsigned int len; len = skb->len + esp->tailen - skb_transport_offset(skb); @@ -398,12 +398,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, *skb_mac_header(skb) = IPPROTO_UDP; - if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - return (struct ip_esp_hdr *)(udpdata32 + 2); - } - return (struct ip_esp_hdr *)(uh + 1); } @@ -459,7 +453,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); break; case TCP_ENCAP_ESPINTCP: @@ -822,7 +815,6 @@ int esp6_input_done2(struct sk_buff *skb, int err) source = th->source; break; case UDP_ENCAP_ESPINUDP: - case UDP_ENCAP_ESPINUDP_NON_IKE: source = uh->source; break; default: @@ -1232,9 +1224,6 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) case UDP_ENCAP_ESPINUDP: x->props.header_len += sizeof(struct udphdr); break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); - break; #ifdef CONFIG_INET6_ESPINTCP case TCP_ENCAP_ESPINTCP: /* only the length field, TCP encap is done by diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1635da07285f..7b31674644ef 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -212,7 +212,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { res = true; } else { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); int tmo = net->ipv6.sysctl.icmpv6_time; struct inet_peer *peer; @@ -241,7 +241,7 @@ static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, dst = ip6_route_output(net, sk, fl6); if (!dst->error) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); @@ -616,7 +616,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, + &ipc6, &fl6, dst_rt6_info(dst), MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -803,7 +803,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT)) { + dst_rt6_info(dst), MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -1206,7 +1206,6 @@ static struct ctl_table ipv6_icmp_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 8c1ce78956ba..0601bad79822 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -38,7 +38,7 @@ static inline struct ila_params *ila_params_lwtunnel( static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); - struct rt6_info *rt = (struct rt6_info *)orig_dst; + struct rt6_info *rt = dst_rt6_info(orig_dst); struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; @@ -70,7 +70,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = orig_dst->dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst, + fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 2e81383b663b..6db71bb1cd30 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -21,6 +21,7 @@ #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> +#include <net/tcp.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -289,7 +290,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (twsk_unique(sk, sk2, twp)) + if (sk->sk_protocol == IPPROTO_TCP && + tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c1f62352a481..31d77885bcae 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -623,23 +623,22 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, - .filter.rtnl_held = true, + .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - unsigned int h, s_h; unsigned int e = 0, s_e; + struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; - struct hlist_head *head; - int res = 0; + unsigned int h, s_h; + int err = 0; + rcu_read_lock(); if (cb->strict_check) { - int err; - err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) - return err; + goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); @@ -654,8 +653,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return -ENOMEM; + if (!w) { + err = -ENOMEM; + goto unlock; + } w->func = fib6_dump_node; cb->args[2] = (long)w; @@ -675,46 +676,46 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) - goto out; + goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } if (!cb->args[0]) { - res = fib6_dump_table(tb, skb, cb); - if (!res) + err = fib6_dump_table(tb, skb, cb); + if (!err) cb->args[0] = 1; } - goto out; + goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; - rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; - res = fib6_dump_table(tb, skb, cb); - if (res != 0) - goto out_unlock; + err = fib6_dump_table(tb, skb, cb); + if (err != 0) + goto out; next: e++; } } -out_unlock: - rcu_read_unlock(); +out: cb->args[1] = e; cb->args[0] = h; -out: - res = res < 0 ? res : skb->len; - if (res <= 0) + +unlock: + rcu_read_unlock(); + if (err <= 0) fib6_dump_end(cb); - return res; + return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) @@ -2509,7 +2510,7 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, 0); + inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED); if (ret) goto out_unregister_subsys; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c89aef524df9..3942bd2ade78 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -496,11 +496,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) tpi->proto); if (tunnel) { if (tunnel->parms.collect_md) { + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; __be64 tun_id; - __be16 flags; - flags = tpi->flags; + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); @@ -551,14 +551,14 @@ static int ip6erspan_rcv(struct sk_buff *skb, if (tunnel->parms.collect_md) { struct erspan_metadata *pkt_md, *md; + IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; - __be16 flags; - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; + __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); + ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, @@ -580,7 +580,8 @@ static int ip6erspan_rcv(struct sk_buff *skb, md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + info->key.tun_flags); info->options_len = sizeof(*md); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); @@ -748,8 +749,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 protocol; - __be16 flags; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; @@ -781,8 +782,11 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - flags = key->tun_flags & - (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); + ip_tunnel_flags_zero(flags); + __set_bit(IP_TUNNEL_CSUM_BIT, flags); + __set_bit(IP_TUNNEL_KEY_BIT, flags); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + ip_tunnel_flags_and(flags, flags, key->tun_flags); tun_hlen = gre_calc_hlen(flags); if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) @@ -791,19 +795,21 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } else { if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; - flags = tunnel->parms.o_flags; + ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) - : 0); + test_bit(IP_TUNNEL_SEQ_BIT, flags) ? + htonl(atomic_fetch_inc(&tunnel->o_seqno)) : + 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -825,7 +831,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return -1; @@ -859,7 +866,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) + if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags))) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, @@ -886,7 +894,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, + t->parms.o_flags)); if (err) return err; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); @@ -939,6 +948,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; @@ -982,7 +992,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; - t->parms.o_flags &= ~TUNNEL_KEY; + __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); IPCB(skb)->flags = 0; /* For collect_md mode, derive fl6 from the tunnel key, @@ -1007,7 +1017,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_info->key.tun_flags)) goto tx_err; if (tun_info->options_len < sizeof(*md)) goto tx_err; @@ -1068,7 +1079,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, } /* Push GRE header. */ - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); + __set_bit(IP_TUNNEL_SEQ_BIT, flags); + gre_build_header(skb, 8, flags, proto, 0, + htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) @@ -1211,8 +1224,8 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, t->parms.proto = p->proto; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; + ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); + ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); t->parms.fwmark = p->fwmark; t->parms.erspan_ver = p->erspan_ver; t->parms.index = p->index; @@ -1241,8 +1254,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = gre_flags_to_tnl_flags(u->i_flags); - p->o_flags = gre_flags_to_tnl_flags(u->o_flags); + gre_flags_to_tnl_flags(p->i_flags, u->i_flags); + gre_flags_to_tnl_flags(p->o_flags, u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -1394,7 +1407,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ipv6h->daddr = t->parms.raddr; p = (__be16 *)(ipv6h + 1); - p[0] = t->parms.o_flags; + p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags); p[1] = htons(type); /* @@ -1421,7 +1434,6 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1431,7 +1443,6 @@ static void ip6gre_dev_free(struct net_device *dev) gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); } static void ip6gre_tunnel_setup(struct net_device *dev) @@ -1440,6 +1451,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_IP6GRE; dev->flags |= IFF_NOARP; @@ -1458,19 +1470,17 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); - __be16 flags; dev->features |= GRE6_FEATURES | NETIF_F_LLTX; dev->hw_features |= GRE6_FEATURES; - flags = nt->parms.o_flags; - /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. */ - if (flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags)) return; - if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE) + if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) && + nt->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -1489,13 +1499,9 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1519,9 +1525,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1795,12 +1798,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_IFLAGS])); + gre_flags_to_tnl_flags(parms->i_flags, + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_OFLAGS])); + gre_flags_to_tnl_flags(parms->o_flags, + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1853,7 +1856,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1882,13 +1884,9 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) - goto cleanup_alloc_pcpu_stats; + return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) @@ -1910,9 +1908,6 @@ static int ip6erspan_tap_init(struct net_device *dev) cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); -cleanup_alloc_pcpu_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; return ret; } @@ -1923,7 +1918,6 @@ static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1937,6 +1931,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); @@ -2147,11 +2142,13 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; - __be16 o_flags = p->o_flags; + IP_TUNNEL_DECLARE_FLAGS(o_flags); + + ip_tunnel_flags_copy(o_flags, p->o_flags); if (p->erspan_ver == 1 || p->erspan_ver == 2) { if (!p->collect_md) - o_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) goto nla_put_failure; @@ -2237,6 +2234,7 @@ static void ip6erspan_tap_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 97b0788b31ba..27d8725445e3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -120,7 +120,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); rcu_read_lock(); - nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); + nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (unlikely(IS_ERR_OR_NULL(neigh))) { @@ -599,7 +599,7 @@ int ip6_forward(struct sk_buff *skb) * send a redirect. */ - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else @@ -856,7 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; bool mono_delivery_time = skb->mono_delivery_time; @@ -1063,7 +1063,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, return NULL; } - rt = (struct rt6_info *)dst; + rt = dst_rt6_info(dst); /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, @@ -1118,7 +1118,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct rt6_info *rt; *dst = ip6_route_output(net, sk, fl6); - rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; + rt = (*dst)->error ? NULL : dst_rt6_info(*dst); rcu_read_lock(); from = rt ? rcu_dereference(rt->from) : NULL; @@ -1159,7 +1159,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * dst entry and replace it instead with the * dst entry of the nexthop router */ - rt = (struct rt6_info *) *dst; + rt = dst_rt6_info(*dst); rcu_read_lock(); n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); @@ -1423,7 +1423,7 @@ static int __ip6_append_data(struct sock *sk, int offset = 0; bool zc = false; u32 tskey = 0; - struct rt6_info *rt = (struct rt6_info *)cork->dst; + struct rt6_info *rt = dst_rt6_info(cork->dst); bool paged, hold_tskey, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; @@ -1877,7 +1877,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; - struct rt6_info *rt = (struct rt6_info *)cork->base.dst; + struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; @@ -1949,7 +1949,7 @@ out: int ip6_send_skb(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; err = ip6_local_out(net, skb->sk, skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e9cc315832cb..9dee0c127955 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -798,17 +798,15 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct ipv6hdr *ipv6h; int nh, err; - if ((!(tpi->flags & TUNNEL_CSUM) && - (tunnel->parms.i_flags & TUNNEL_CSUM)) || - ((tpi->flags & TUNNEL_CSUM) && - !(tunnel->parms.i_flags & TUNNEL_CSUM))) { + if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != + test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } - if (tunnel->parms.i_flags & TUNNEL_SEQ) { - if (!(tpi->flags & TUNNEL_SEQ) || + if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { + if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); @@ -946,7 +944,9 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { - tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; + + tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } @@ -1746,7 +1746,7 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) if (new_mtu > IP_MAX_MTU - dev->hard_header_len) return -EINVAL; } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); @@ -2146,7 +2146,7 @@ struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); - return tunnel->net; + return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 7f4f976aa24a..590737c27537 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -174,11 +174,6 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) } } -static void vti6_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); -} - static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); @@ -671,7 +666,8 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { - dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu); + WRITE_ONCE(dev->mtu, + clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } @@ -892,7 +888,6 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, - .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -908,8 +903,8 @@ static void vti6_dev_setup(struct net_device *dev) dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; - dev->priv_destructor = vti6_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); @@ -931,9 +926,6 @@ static inline int vti6_dev_init_gen(struct net_device *dev) t->dev = dev; t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index cb0ee81a068a..dd342e6ecf3f 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2273,7 +2273,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, int err; struct mr_table *mrt; struct mfc6_cache *cache; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ae134634c323..d914b23256ce 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1722,7 +1722,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) if (IS_ERR(dst)) return; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d0dcbaca1994..5e1b50c6a44d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -62,7 +62,6 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; static int nf_ct_frag6_sysctl_register(struct net *net) @@ -105,7 +104,7 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ef2059c88955..88b3fcacd4f9 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,7 +154,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0d896ca7b589..2eedf255600b 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -598,7 +598,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct ipv6hdr *iph; struct sk_buff *skb; int err; - struct rt6_info *rt = (struct rt6_info *)*dstp; + struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; @@ -917,7 +917,7 @@ back_from_confirm: ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, - len, 0, &ipc6, &fl6, (struct rt6_info *)dst, + len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index acb4f119e11f..327caca64257 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -369,7 +369,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; - if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); @@ -436,7 +436,6 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -449,7 +448,6 @@ static struct ctl_table ip6_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) @@ -487,7 +485,7 @@ err_alloc: static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1f4b935a0e57..c43b0616742e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -226,7 +226,7 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { - const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst->dev, skb, daddr); @@ -234,8 +234,8 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst->dev; - struct rt6_info *rt = (struct rt6_info *)dst; daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) @@ -354,7 +354,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; @@ -373,7 +373,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; if (idev && idev->dev != blackhole_netdev) { @@ -1288,7 +1288,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) - return (struct rt6_info *) dst; + return dst_rt6_info(dst); dst_release(dst); @@ -2647,7 +2647,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); - rt6 = (struct rt6_info *)dst; + rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; @@ -2661,7 +2661,7 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig; + struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; @@ -2744,7 +2744,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, struct fib6_info *from; struct rt6_info *rt; - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; @@ -2772,7 +2772,7 @@ EXPORT_INDIRECT_CALLABLE(ip6_dst_check); static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; + struct rt6_info *rt = dst_rt6_info(dst); if (rt) { if (rt->rt6i_flags & RTF_CACHE) { @@ -2796,7 +2796,7 @@ static void ip6_link_failure(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { @@ -2852,7 +2852,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. @@ -4174,7 +4174,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu } } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; @@ -5608,7 +5608,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - struct rt6_info *rt6 = (struct rt6_info *)dst; + struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; @@ -6111,7 +6111,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } - rt = container_of(dst, struct rt6_info, dst); + rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); @@ -6428,7 +6428,6 @@ static struct ctl_table ipv6_route_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) @@ -6452,10 +6451,6 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[1].procname = NULL; } return table; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 655c9b1a19b8..83b195f09561 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -132,8 +132,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, - struct ip_tunnel_parm *parms) +static struct ip_tunnel __rcu ** +__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -207,7 +207,7 @@ static int ipip6_tunnel_create(struct net_device *dev) __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); - if ((__force u16)t->parms.i_flags & SIT_ISATAP) + if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags)) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; @@ -226,7 +226,8 @@ out: } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, - struct ip_tunnel_parm *parms, int create) + struct ip_tunnel_parm_kern *parms, + int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; @@ -1135,7 +1136,8 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, +static void ipip6_tunnel_update(struct ip_tunnel *t, + struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct net *net = t->net; @@ -1196,11 +1198,11 @@ static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; - struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, data, sizeof(p))) + if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1251,7 +1253,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1268,7 +1270,7 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1281,7 +1283,7 @@ ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1297,7 +1299,7 @@ ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; @@ -1328,7 +1330,7 @@ ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); @@ -1348,7 +1350,8 @@ ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) } static int -ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, + int cmd) { switch (cmd) { case SIOCGETTUNNEL: @@ -1490,7 +1493,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, + struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1599,8 +1602,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD @@ -1687,7 +1690,7 @@ static size_t ipip6_get_size(const struct net_device *dev) static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_parm *parm = &tunnel->parms; + struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || @@ -1697,7 +1700,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || - nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || + nla_put_be16(skb, IFLA_IPTUN_FLAGS, + ip_tunnel_flags_to_be16(parm->i_flags)) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 6d8286c299c9..bfad1e89b6a6 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -246,7 +246,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } } - req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); + req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 888676163e90..c060285ff47f 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -213,7 +213,6 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, - { } }; static struct ctl_table ipv6_rotable[] = { @@ -248,11 +247,11 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ - { } }; static int __net_init ipv6_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; @@ -264,7 +263,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++) + for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); @@ -276,8 +275,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) goto out_ipv6_route_table; net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", - ipv6_table, - ARRAY_SIZE(ipv6_table_template)); + ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; @@ -313,9 +311,9 @@ out_ipv6_table: static void __net_exit ipv6_sysctl_net_exit(struct net *net) { - struct ctl_table *ipv6_table; - struct ctl_table *ipv6_route_table; - struct ctl_table *ipv6_icmp_table; + const struct ctl_table *ipv6_table; + const struct ctl_table *ipv6_route_table; + const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3f4cba49e9ee..7f6693e794bd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -60,6 +60,7 @@ #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/busy_poll.h> +#include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -69,7 +70,8 @@ #include <trace/events/tcp.h> -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -95,11 +97,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } @@ -793,7 +793,8 @@ clear_hash_nostart: static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, - struct sk_buff *skb) + struct sk_buff *skb, + u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); @@ -807,7 +808,7 @@ static void tcp_v6_init_req(struct request_sock *req, ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); - if (!TCP_SKB_CB(skb)->tcp_tw_isn && + if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || @@ -820,9 +821,10 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { - tcp_v6_init_req(req, sk, skb); + tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; @@ -1006,7 +1008,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 kfree_skb(buff); } -static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) +static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, + enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -1113,7 +1116,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { - trace_tcp_send_reset(sk, skb); if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); @@ -1129,6 +1131,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) label = ip6_flowlabel(ipv6h); } + trace_tcp_send_reset(sk, skb, reason); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key); @@ -1674,7 +1678,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; reset: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); @@ -1738,7 +1742,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); - TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1755,6 +1758,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; struct sock *sk; int ret; + u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -1791,7 +1795,6 @@ lookup: if (!sk) goto no_tcp_socket; -process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; @@ -1860,7 +1863,10 @@ process: } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { - tcp_v6_send_reset(nsk, skb); + enum sk_rst_reason rst_reason; + + rst_reason = sk_rst_convert_drop_reason(drop_reason); + tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); @@ -1868,6 +1874,7 @@ process: } } +process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { @@ -1936,7 +1943,7 @@ csum_error: bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { - tcp_v6_send_reset(NULL, skb); + tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: @@ -1964,7 +1971,7 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; @@ -1982,6 +1989,7 @@ do_time_wait: sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; + __this_cpu_write(tcp_tw_isn, isn); goto process; } } @@ -1991,7 +1999,7 @@ do_time_wait: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: - tcp_v6_send_reset(sk, skb); + tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason)); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: @@ -2041,7 +2049,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, .twsk_destructor = tcp_twsk_destructor, }; @@ -2389,15 +2396,9 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - tcp_twsk_purge(net_exit_list, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 4b07d1e6c952..d59f58cbd306 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -7,24 +7,67 @@ */ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> +#include <net/inet6_hashtables.h> #include <net/gro.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/ip6_checksum.h> #include "ip6_offload.h" +static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, + struct tcphdr *th) +{ +#if IS_ENABLED(CONFIG_IPV6) + const struct ipv6hdr *hdr; + struct sk_buff *p; + struct sock *sk; + struct net *net; + int iif, sdif; + + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) + return; + + p = tcp_gro_lookup(head, th); + if (p) { + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; + return; + } + + inet6_get_iif_sdif(skb, &iif, &sdif); + hdr = skb_gro_network_header(skb); + net = dev_net(skb->dev); + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + iif, sdif); + NAPI_GRO_CB(skb)->is_flist = !sk; + if (sk) + sock_put(sk); +#endif /* IS_ENABLED(CONFIG_IPV6) */ +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) { + struct tcphdr *th; + /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, - ip6_gro_compute_pseudo)) { - NAPI_GRO_CB(skb)->flush = 1; - return NULL; - } + ip6_gro_compute_pseudo)) + goto flush; - return tcp_gro_receive(head, skb); + th = tcp_gro_pull_header(skb); + if (!th) + goto flush; + + tcp6_check_fraglist_gro(head, skb, th); + + return tcp_gro_receive(head, skb, th); + +flush: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; } INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) @@ -32,6 +75,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) const struct ipv6hdr *iph = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6; + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + __skb_incr_checksum_unnecessary(skb); + + return 0; + } + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, &iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; @@ -40,6 +92,61 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) return 0; } +static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + __be16 *oldport, __be16 newport) +{ + struct tcphdr *th; + + if (*oldport == newport) + return; + + th = tcp_hdr(seg); + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); + *oldport = newport; +} + +static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct tcphdr *th; + const struct ipv6hdr *iph; + struct sk_buff *seg; + struct tcphdr *th2; + struct ipv6hdr *iph2; + + seg = segs; + th = tcp_hdr(seg); + iph = ipv6_hdr(seg); + th2 = tcp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + th2 = tcp_hdr(seg); + iph2 = ipv6_hdr(seg); + + iph2->saddr = iph->saddr; + iph2->daddr = iph->daddr; + __tcpv6_gso_segment_csum(seg, &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest); + } + + return segs; +} + +static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + return __tcpv6_gso_segment_list_csum(skb); +} + static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -51,6 +158,9 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) + return __tcp6_gso_segment_list(skb, features); + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8f7aa8bac1e7..c81a07ac0463 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/indirect_call_wrapper.h> +#include <trace/events/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> @@ -168,15 +169,21 @@ static struct sock *udp6_lib_lookup2(struct net *net, { struct sock *sk, *result; int score, badness; + bool need_rescore; result = NULL; badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + need_rescore = false; +rescore: + score = compute_score(need_rescore ? result : sk, net, saddr, + sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; + if (need_rescore) + continue; + if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; @@ -197,8 +204,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, if (IS_ERR(result)) continue; - badness = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif); + /* compute_score is too long of a function to be + * inlined, and calling it again here yields + * measureable overhead for some + * workloads. Work around it by jumping + * backwards to rescore 'result'. + */ + need_rescore = true; + goto rescore; } } return result; @@ -659,8 +672,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } @@ -898,11 +911,8 @@ start_lookup: static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { - if (udp_sk_rx_dst_set(sk, dst)) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - - sk->sk_rx_dst_cookie = rt6_get_cookie(rt); - } + if (udp_sk_rx_dst_set(sk, dst)) + sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and @@ -1573,7 +1583,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, - (struct rt6_info *)dst, + dst_rt6_info(dst), msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) @@ -1600,7 +1610,7 @@ do_append_data: ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), - &ipc6, fl6, (struct rt6_info *)dst, + &ipc6, fl6, dst_rt6_info(dst), corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index c6b8e132e10a..4abc5e9d6322 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -113,19 +113,6 @@ static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull /* Must be an IKE packet.. pass it through */ return 1; break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - /* Check if this is a keepalive packet. If so, eat it. */ - if (len == 1 && udpdata[0] == 0xff) { - return -EINVAL; - } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && - udpdata32[0] == 0 && udpdata32[1] == 0) { - - /* ESP Packet with Non-IKE marker */ - len = sizeof(struct udphdr) + 2 * sizeof(u32); - } else - /* Must be an IKE packet.. pass it through */ - return 1; - break; } /* At this point we are sure that this is an ESPinUDP packet, @@ -283,6 +270,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, if (!x) continue; + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + xfrm_state_put(x); + x = NULL; + continue; + } + spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 42fb6996b077..cc885d3aa9e5 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -70,7 +70,7 @@ static int xfrm6_get_saddr(struct net *net, int oif, static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { - struct rt6_info *rt = (struct rt6_info *)xdst->route; + struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); @@ -184,7 +184,6 @@ static struct ctl_table xfrm6_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __net_init xfrm6_net_sysctl_init(struct net *net) @@ -218,7 +217,7 @@ err_alloc: static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7c8c3adcac6e..c951bb9cc2e0 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index a4ab615ca3e3..5e37a8ceebcb 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -520,7 +520,7 @@ static void iucv_setmask_mp(void) */ static void iucv_setmask_up(void) { - cpumask_t cpumask; + static cpumask_t cpumask; int cpu; /* Disable all cpu but the first in cpu_irq_cpumask. */ @@ -628,23 +628,33 @@ static int iucv_cpu_online(unsigned int cpu) static int iucv_cpu_down_prep(unsigned int cpu) { - cpumask_t cpumask; + cpumask_var_t cpumask; + int ret = 0; if (!iucv_path_table) return 0; - cpumask_copy(&cpumask, &iucv_buffer_cpumask); - cpumask_clear_cpu(cpu, &cpumask); - if (cpumask_empty(&cpumask)) + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(cpumask, &iucv_buffer_cpumask); + cpumask_clear_cpu(cpu, cpumask); + if (cpumask_empty(cpumask)) { /* Can't offline last IUCV enabled cpu. */ - return -EINVAL; + ret = -EINVAL; + goto __free_cpumask; + } iucv_retrieve_cpu(NULL); if (!cpumask_empty(&iucv_irq_cpumask)) - return 0; + goto __free_cpumask; + smp_call_function_single(cpumask_first(&iucv_buffer_cpumask), iucv_allow_cpu, NULL, 1); - return 0; + +__free_cpumask: + free_cpumask_var(cpumask); + return ret; } /** diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 8d21ff25f160..2ab45e3f48bf 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -794,6 +794,7 @@ static void l2tp_session_queue_purge(struct l2tp_session *session) static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) { struct l2tp_session *session = NULL; + struct l2tp_tunnel *orig_tunnel = tunnel; unsigned char *ptr, *optr; u16 hdrflags; u32 tunnel_id, session_id; @@ -845,6 +846,20 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) /* Extract tunnel and session ID */ tunnel_id = ntohs(*(__be16 *)ptr); ptr += 2; + + if (tunnel_id != tunnel->tunnel_id) { + /* We are receiving trafic for another tunnel, probably + * because we have several tunnels between the same + * IP/port quadruple, look it up. + */ + struct l2tp_tunnel *alt_tunnel; + + alt_tunnel = l2tp_tunnel_get(tunnel->l2tp_net, tunnel_id); + if (!alt_tunnel || alt_tunnel->version != L2TP_HDR_VER_2) + goto pass; + tunnel = alt_tunnel; + } + session_id = ntohs(*(__be16 *)ptr); ptr += 2; } else { @@ -875,6 +890,9 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); + if (tunnel != orig_tunnel) + l2tp_tunnel_dec_refcount(tunnel); + return 0; invalid: @@ -884,6 +902,9 @@ pass: /* Put UDP header back */ __skb_push(skb, sizeof(struct udphdr)); + if (tunnel != orig_tunnel) + l2tp_tunnel_dec_refcount(tunnel); + return 1; } diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 970af3983d11..19c8cc5289d5 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -459,7 +459,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &inet->cork.fl.u.ip4; if (connected) - rt = (struct rtable *)__sk_dst_check(sk, 0); + rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 7bf14cf9ffaa..8780ec64f376 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -630,7 +630,7 @@ back_from_confirm: ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0); err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, - &fl6, (struct rt6_info *)dst, + &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 8443a6d841b0..72e101135f8c 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -44,11 +44,6 @@ static struct ctl_table llc2_timeout_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, -}; - -static struct ctl_table llc_station_table[] = { - { }, }; static struct ctl_table_header *llc2_timeout_header; @@ -56,8 +51,9 @@ static struct ctl_table_header *llc_station_header; int __init llc_sysctl_init(void) { + struct ctl_table empty[1] = {}; llc2_timeout_header = register_net_sysctl(&init_net, "net/llc/llc2/timeout", llc2_timeout_table); - llc_station_header = register_net_sysctl(&init_net, "net/llc/station", llc_station_table); + llc_station_header = register_net_sysctl_sz(&init_net, "net/llc/station", empty, 0); if (!llc2_timeout_header || !llc_station_header) { llc_sysctl_exit(); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f67c1d021812..b08e5d7687e3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1486,7 +1486,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(link->u.ap.beacon, NULL); - sdata->u.ap.active = false; + + if (ieee80211_num_beaconing_links(sdata) == 0) + sdata->u.ap.active = false; + goto error; } @@ -1607,10 +1610,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, /* abort any running channel switch or color change */ link_conf->csa_active = false; link_conf->color_change_active = false; - if (sdata->csa_blocked_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } ieee80211_free_next_beacon(link); @@ -1619,11 +1622,12 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); - if (ieee80211_num_beaconing_links(sdata) <= 1) + if (ieee80211_num_beaconing_links(sdata) <= 1) { netif_carrier_off(dev); + sdata->u.ap.active = false; + } /* remove beacon and probe response */ - sdata->u.ap.active = false; RCU_INIT_POINTER(link->u.ap.beacon, NULL); RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); @@ -3648,7 +3652,7 @@ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_t struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; - sdata->csa_blocked_tx = block_tx; + sdata->csa_blocked_queues = block_tx; sdata_info(sdata, "channel switch failed, disconnecting\n"); wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work); } @@ -3734,10 +3738,10 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) ieee80211_link_info_change_notify(sdata, link_data, changed); - if (sdata->csa_blocked_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } err = drv_post_channel_switch(link_data); @@ -3914,13 +3918,13 @@ static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data, return 0; } -static void ieee80211_color_change_abort(struct ieee80211_sub_if_data *sdata) +static void ieee80211_color_change_abort(struct ieee80211_link_data *link) { - sdata->vif.bss_conf.color_change_active = false; + link->conf->color_change_active = false; - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link); - cfg80211_color_change_aborted_notify(sdata->dev); + cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id); } static int @@ -4004,7 +4008,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, /* if there is a color change in progress, abort it */ if (link_conf->color_change_active) - ieee80211_color_change_abort(sdata); + ieee80211_color_change_abort(link_data); err = ieee80211_set_csa_beacon(link_data, params, &changed); if (err) { @@ -4019,7 +4023,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) { ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = true; + sdata->csa_blocked_queues = true; } cfg80211_ch_switch_started_notify(sdata->dev, @@ -4662,20 +4666,22 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy, } static int -ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, +ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link, u64 *changed) { + struct ieee80211_sub_if_data *sdata = link->sdata; + switch (sdata->vif.type) { case NL80211_IFTYPE_AP: { int ret; - if (!sdata->deflink.u.ap.next_beacon) + if (!link->u.ap.next_beacon) return -EINVAL; - ret = ieee80211_assign_beacon(sdata, &sdata->deflink, - sdata->deflink.u.ap.next_beacon, + ret = ieee80211_assign_beacon(sdata, link, + link->u.ap.next_beacon, NULL, NULL, changed); - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link); if (ret < 0) return ret; @@ -4691,18 +4697,19 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, } static int -ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, +ieee80211_set_color_change_beacon(struct ieee80211_link_data *link, struct cfg80211_color_change_settings *params, u64 *changed) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_color_change_settings color_change = {}; int err; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: - sdata->deflink.u.ap.next_beacon = + link->u.ap.next_beacon = cfg80211_beacon_dup(¶ms->beacon_next); - if (!sdata->deflink.u.ap.next_beacon) + if (!link->u.ap.next_beacon) return -ENOMEM; if (params->count <= 1) @@ -4714,11 +4721,11 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, params->counter_offset_presp; color_change.count = params->count; - err = ieee80211_assign_beacon(sdata, &sdata->deflink, + err = ieee80211_assign_beacon(sdata, link, ¶ms->beacon_color_change, NULL, &color_change, changed); if (err < 0) { - ieee80211_free_next_beacon(&sdata->deflink); + ieee80211_free_next_beacon(link); return err; } break; @@ -4730,16 +4737,18 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, } static void -ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, +ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, u8 color, int enable, u64 changed) { + struct ieee80211_sub_if_data *sdata = link->sdata; + lockdep_assert_wiphy(sdata->local->hw.wiphy); - sdata->vif.bss_conf.he_bss_color.color = color; - sdata->vif.bss_conf.he_bss_color.enabled = enable; + link->conf->he_bss_color.color = color; + link->conf->he_bss_color.enabled = enable; changed |= BSS_CHANGED_HE_BSS_COLOR; - ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); + ieee80211_link_info_change_notify(sdata, link, changed); if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { struct ieee80211_sub_if_data *child; @@ -4756,26 +4765,27 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, } } -static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) +static int ieee80211_color_change_finalize(struct ieee80211_link_data *link) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); - sdata->vif.bss_conf.color_change_active = false; + link->conf->color_change_active = false; - err = ieee80211_set_after_color_change_beacon(sdata, &changed); + err = ieee80211_set_after_color_change_beacon(link, &changed); if (err) { - cfg80211_color_change_aborted_notify(sdata->dev); + cfg80211_color_change_aborted_notify(sdata->dev, link->link_id); return err; } - ieee80211_color_change_bss_config_notify(sdata, - sdata->vif.bss_conf.color_change_color, + ieee80211_color_change_bss_config_notify(link, + link->conf->color_change_color, 1, changed); - cfg80211_color_change_notify(sdata->dev); + cfg80211_color_change_notify(sdata->dev, link->link_id); return 0; } @@ -4783,21 +4793,23 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) void ieee80211_color_change_finalize_work(struct wiphy *wiphy, struct wiphy_work *work) { - struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, - deflink.color_change_finalize_work); + struct ieee80211_link_data *link = + container_of(work, struct ieee80211_link_data, + color_change_finalize_work); + struct ieee80211_sub_if_data *sdata = link->sdata; + struct ieee80211_bss_conf *link_conf = link->conf; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); /* AP might have been stopped while waiting for the lock. */ - if (!sdata->vif.bss_conf.color_change_active) + if (!link_conf->color_change_active) return; if (!ieee80211_sdata_running(sdata)) return; - ieee80211_color_change_finalize(sdata); + ieee80211_color_change_finalize(link); } void ieee80211_color_collision_detection_work(struct work_struct *work) @@ -4808,30 +4820,60 @@ void ieee80211_color_collision_detection_work(struct work_struct *work) color_collision_detect_work); struct ieee80211_sub_if_data *sdata = link->sdata; - cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap); + cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, + link->link_id); } -void ieee80211_color_change_finish(struct ieee80211_vif *vif) +void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link; + + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return; + + rcu_read_lock(); + + link = rcu_dereference(sdata->link[link_id]); + if (WARN_ON(!link)) { + rcu_read_unlock(); + return; + } wiphy_work_queue(sdata->local->hw.wiphy, - &sdata->deflink.color_change_finalize_work); + &link->color_change_finalize_work); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); void ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, - u64 color_bitmap) + u64 color_bitmap, u8 link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_link_data *link = &sdata->deflink; + struct ieee80211_link_data *link; - if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) return; - if (delayed_work_pending(&link->color_collision_detect_work)) + rcu_read_lock(); + + link = rcu_dereference(sdata->link[link_id]); + if (WARN_ON(!link)) { + rcu_read_unlock(); return; + } + + if (link->conf->color_change_active || link->conf->csa_active) { + rcu_read_unlock(); + return; + } + + if (delayed_work_pending(&link->color_collision_detect_work)) { + rcu_read_unlock(); + return; + } link->color_bitmap = color_bitmap; /* queue the color collision detection event every 500 ms in order to @@ -4840,6 +4882,8 @@ ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif, ieee80211_queue_delayed_work(&sdata->local->hw, &link->color_collision_detect_work, msecs_to_jiffies(500)); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify); @@ -4849,36 +4893,48 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *link_conf; + struct ieee80211_link_data *link; + u8 link_id = params->link_id; u64 changed = 0; int err; lockdep_assert_wiphy(local->hw.wiphy); - if (sdata->vif.bss_conf.nontransmitted) + if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) + return -EINVAL; + + link = wiphy_dereference(wiphy, sdata->link[link_id]); + if (!link) + return -ENOLINK; + + link_conf = link->conf; + + if (link_conf->nontransmitted) return -EINVAL; /* don't allow another color change if one is already active or if csa * is active */ - if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) { + if (link_conf->color_change_active || link_conf->csa_active) { err = -EBUSY; goto out; } - err = ieee80211_set_color_change_beacon(sdata, params, &changed); + err = ieee80211_set_color_change_beacon(link, params, &changed); if (err) goto out; - sdata->vif.bss_conf.color_change_active = true; - sdata->vif.bss_conf.color_change_color = params->color; + link_conf->color_change_active = true; + link_conf->color_change_color = params->color; - cfg80211_color_change_started_notify(sdata->dev, params->count); + cfg80211_color_change_started_notify(sdata->dev, params->count, link_id); if (changed) - ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed); + ieee80211_color_change_bss_config_notify(link, 0, 0, changed); else /* if the beacon didn't change, we can finalize immediately */ - ieee80211_color_change_finalize(sdata); + ieee80211_color_change_finalize(link); out: diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index ccacaed32817..380695fdc32f 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -547,8 +547,10 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, _ieee80211_change_chanctx(local, ctx, old_ctx, chanreq, NULL); } +/* Note: if successful, the returned chanctx is reserved for the link */ static struct ieee80211_chanctx * ieee80211_find_chanctx(struct ieee80211_local *local, + struct ieee80211_link_data *link, const struct ieee80211_chan_req *chanreq, enum ieee80211_chanctx_mode mode) { @@ -560,6 +562,9 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (mode == IEEE80211_CHANCTX_EXCLUSIVE) return NULL; + if (WARN_ON(link->reserved_chanctx)) + return NULL; + list_for_each_entry(ctx, &local->chanctx_list, list) { const struct ieee80211_chan_req *compat; @@ -578,6 +583,16 @@ ieee80211_find_chanctx(struct ieee80211_local *local, if (!compat) continue; + /* + * Reserve the chanctx temporarily, as the driver might change + * active links during callbacks we make into it below and/or + * later during assignment, which could (otherwise) cause the + * context to actually be removed. + */ + link->reserved_chanctx = ctx; + list_add(&link->reserved_chanctx_list, + &ctx->reserved_links); + ieee80211_change_chanctx(local, ctx, ctx, compat); return ctx; @@ -673,7 +688,8 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, static struct ieee80211_chanctx * ieee80211_new_chanctx(struct ieee80211_local *local, const struct ieee80211_chan_req *chanreq, - enum ieee80211_chanctx_mode mode) + enum ieee80211_chanctx_mode mode, + bool assign_on_failure) { struct ieee80211_chanctx *ctx; int err; @@ -685,36 +701,41 @@ ieee80211_new_chanctx(struct ieee80211_local *local, return ERR_PTR(-ENOMEM); err = ieee80211_add_chanctx(local, ctx); - if (err) { + if (!assign_on_failure && err) { kfree(ctx); return ERR_PTR(err); } + /* We ignored a driver error, see _ieee80211_set_active_links */ + WARN_ON_ONCE(err && !local->in_reconfig); list_add_rcu(&ctx->list, &local->chanctx_list); return ctx; } static void ieee80211_del_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + bool skip_idle_recalc) { lockdep_assert_wiphy(local->hw.wiphy); drv_remove_chanctx(local, ctx); - ieee80211_recalc_idle(local); + if (!skip_idle_recalc) + ieee80211_recalc_idle(local); ieee80211_remove_wbrf(local, &ctx->conf.def); } static void ieee80211_free_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + bool skip_idle_recalc) { lockdep_assert_wiphy(local->hw.wiphy); WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0); list_del_rcu(&ctx->list); - ieee80211_del_chanctx(local, ctx); + ieee80211_del_chanctx(local, ctx, skip_idle_recalc); kfree_rcu(ctx, rcu_head); } @@ -791,14 +812,15 @@ static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local, } static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, - struct ieee80211_chanctx *new_ctx) + struct ieee80211_chanctx *new_ctx, + bool assign_on_failure) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *curr_ctx = NULL; bool new_idle; - int ret = 0; + int ret; if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) return -EOPNOTSUPP; @@ -819,15 +841,20 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, ieee80211_recalc_chanctx_min_def(local, new_ctx, link); ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); - if (ret) - goto out; - - conf = &new_ctx->conf; - list_add(&link->assigned_chanctx_list, - &new_ctx->assigned_links); + if (assign_on_failure || !ret) { + /* Need to continue, see _ieee80211_set_active_links */ + WARN_ON_ONCE(ret && !local->in_reconfig); + ret = 0; + + /* succeeded, so commit it to the data structures */ + conf = &new_ctx->conf; + list_add(&link->assigned_chanctx_list, + &new_ctx->assigned_links); + } + } else { + ret = 0; } -out: rcu_assign_pointer(link->conf->chanctx_conf, conf); if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) { @@ -1019,7 +1046,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) list_del_rcu(&ctx->list); kfree_rcu(ctx, rcu_head); } else { - ieee80211_free_chanctx(sdata->local, ctx); + ieee80211_free_chanctx(sdata->local, ctx, false); } } @@ -1044,7 +1071,8 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, new_ctx = ieee80211_find_reservation_chanctx(local, chanreq, mode); if (!new_ctx) { if (ieee80211_can_create_new_chanctx(local)) { - new_ctx = ieee80211_new_chanctx(local, chanreq, mode); + new_ctx = ieee80211_new_chanctx(local, chanreq, mode, + false); if (IS_ERR(new_ctx)) return PTR_ERR(new_ctx); } else { @@ -1235,7 +1263,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) CHANCTX_SWMODE_REASSIGN_VIF); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) - ieee80211_free_chanctx(local, new_ctx); + ieee80211_free_chanctx(local, new_ctx, false); goto out; } @@ -1249,7 +1277,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) ieee80211_check_fast_xmit_iface(sdata); if (ieee80211_chanctx_refcount(local, old_ctx) == 0) - ieee80211_free_chanctx(local, old_ctx); + ieee80211_free_chanctx(local, old_ctx, false); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); ieee80211_recalc_smps_chanctx(local, new_ctx); @@ -1300,10 +1328,10 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; - err = ieee80211_assign_link_chanctx(link, new_ctx); + err = ieee80211_assign_link_chanctx(link, new_ctx, false); if (err) { if (ieee80211_chanctx_refcount(local, new_ctx) == 0) - ieee80211_free_chanctx(local, new_ctx); + ieee80211_free_chanctx(local, new_ctx, false); goto out; } @@ -1400,7 +1428,7 @@ static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; - ieee80211_del_chanctx(local, ctx->replace_ctx); + ieee80211_del_chanctx(local, ctx->replace_ctx, false); err = ieee80211_add_chanctx(local, ctx); if (err) goto err; @@ -1417,7 +1445,7 @@ err: if (!list_empty(&ctx->replace_ctx->assigned_links)) continue; - ieee80211_del_chanctx(local, ctx); + ieee80211_del_chanctx(local, ctx, false); WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx)); } @@ -1669,7 +1697,8 @@ err: return err; } -static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) +void __ieee80211_link_release_channel(struct ieee80211_link_data *link, + bool skip_idle_recalc) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *link_conf = link->conf; @@ -1695,9 +1724,9 @@ static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) ieee80211_link_unreserve_chanctx(link); } - ieee80211_assign_link_chanctx(link, NULL); + ieee80211_assign_link_chanctx(link, NULL, false); if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); + ieee80211_free_chanctx(local, ctx, skip_idle_recalc); link->radar_required = false; @@ -1706,14 +1735,16 @@ static void __ieee80211_link_release_channel(struct ieee80211_link_data *link) ieee80211_vif_use_reserved_switch(local); } -int ieee80211_link_use_channel(struct ieee80211_link_data *link, - const struct ieee80211_chan_req *chanreq, - enum ieee80211_chanctx_mode mode) +int _ieee80211_link_use_channel(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *chanreq, + enum ieee80211_chanctx_mode mode, + bool assign_on_failure) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx *ctx; u8 radar_detect_width = 0; + bool reserved = false; int ret; lockdep_assert_wiphy(local->hw.wiphy); @@ -1738,11 +1769,15 @@ int ieee80211_link_use_channel(struct ieee80211_link_data *link, if (ret < 0) goto out; - __ieee80211_link_release_channel(link); + __ieee80211_link_release_channel(link, false); - ctx = ieee80211_find_chanctx(local, chanreq, mode); - if (!ctx) - ctx = ieee80211_new_chanctx(local, chanreq, mode); + ctx = ieee80211_find_chanctx(local, link, chanreq, mode); + /* Note: context is now reserved */ + if (ctx) + reserved = true; + else + ctx = ieee80211_new_chanctx(local, chanreq, mode, + assign_on_failure); if (IS_ERR(ctx)) { ret = PTR_ERR(ctx); goto out; @@ -1750,11 +1785,19 @@ int ieee80211_link_use_channel(struct ieee80211_link_data *link, ieee80211_link_update_chanreq(link, chanreq); - ret = ieee80211_assign_link_chanctx(link, ctx); + ret = ieee80211_assign_link_chanctx(link, ctx, assign_on_failure); + + if (reserved) { + /* remove reservation */ + WARN_ON(link->reserved_chanctx != ctx); + link->reserved_chanctx = NULL; + list_del(&link->reserved_chanctx_list); + } + if (ret) { /* if assign fails refcount stays the same */ if (ieee80211_chanctx_refcount(local, ctx) == 0) - ieee80211_free_chanctx(local, ctx); + ieee80211_free_chanctx(local, ctx, false); goto out; } @@ -1947,7 +1990,7 @@ void ieee80211_link_release_channel(struct ieee80211_link_data *link) lockdep_assert_wiphy(sdata->local->hw.wiphy); if (rcu_access_pointer(link->conf->chanctx_conf)) - __ieee80211_link_release_channel(link); + __ieee80211_link_release_channel(link, false); } void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2f68e92a7404..98310188f330 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -498,6 +498,7 @@ static const char *hw_flag_names[] = { FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), FLAG(DISALLOW_PUNCTURING), + FLAG(DISALLOW_PUNCTURING_5GHZ), FLAG(HANDLES_QUIET_CSA), #undef FLAG }; diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h index 12a6f0e9eca6..59e3ec4dc960 100644 --- a/net/mac80211/drop.h +++ b/net/mac80211/drop.h @@ -2,7 +2,7 @@ /* * mac80211 drop reason list * - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation */ #ifndef MAC80211_DROP_H @@ -66,6 +66,7 @@ typedef unsigned int __bitwise ieee80211_rx_result; R(RX_DROP_U_UNEXPECTED_STA_4ADDR) \ R(RX_DROP_U_UNEXPECTED_VLAN_MCAST) \ R(RX_DROP_U_NOT_PORT_CONTROL) \ + R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \ /* this line for the trailing \ - add before this */ /* having two enums allows for checking ieee80211_rx_result use with sparse */ diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c3330aea4da3..d7e8cf8e48b7 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -580,7 +580,7 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, /* we'll do more on status of this frame */ info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; - /* we have 12 bits, and need 6: link_id 4, smps 2 */ + /* we have 13 bits, and need 6: link_id 4, smps 2 */ info->status_data = IEEE80211_STATUS_TYPE_SMPS | u16_encode_bits(status_link_id << 2 | smps, IEEE80211_STATUS_SUBDATA_MASK); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index bd507d6b65e3..eb62b7d4b4f7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -89,7 +89,8 @@ enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, IEEE80211_STATUS_TYPE_SMPS = 1, - IEEE80211_STATUS_SUBDATA_MASK = 0xff0, + IEEE80211_STATUS_TYPE_NEG_TTLM = 2, + IEEE80211_STATUS_SUBDATA_MASK = 0x1ff0, }; static inline bool @@ -595,6 +596,7 @@ struct ieee80211_if_managed { /* TID-to-link mapping support */ struct wiphy_delayed_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; + struct wiphy_work teardown_ttlm_work; /* dialog token enumerator for neg TTLM request */ u8 dialog_token_alloc; @@ -684,7 +686,7 @@ struct mesh_csa_settings { }; /** - * struct mesh_table + * struct mesh_table - mesh hash table * * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. @@ -974,6 +976,7 @@ struct ieee80211_link_data_managed { bool csa_waiting_bcn; bool csa_ignored_same_chan; + bool csa_blocked_tx; struct wiphy_delayed_work chswitch_work; struct wiphy_work request_smps_work; @@ -1092,7 +1095,7 @@ struct ieee80211_sub_if_data { unsigned long state; - bool csa_blocked_tx; + bool csa_blocked_queues; char name[IFNAMSIZ]; @@ -1159,6 +1162,8 @@ struct ieee80211_sub_if_data { struct wiphy_work activate_links_work; u16 desired_active_links; + u16 restart_active_links; + #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *subdir_stations; @@ -2549,9 +2554,19 @@ bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a, const struct ieee80211_chan_req *b); int __must_check +_ieee80211_link_use_channel(struct ieee80211_link_data *link, + const struct ieee80211_chan_req *req, + enum ieee80211_chanctx_mode mode, + bool assign_on_failure); + +static inline int __must_check ieee80211_link_use_channel(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, - enum ieee80211_chanctx_mode mode); + enum ieee80211_chanctx_mode mode) +{ + return _ieee80211_link_use_channel(link, req, mode, false); +} + int __must_check ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, @@ -2565,6 +2580,8 @@ int __must_check ieee80211_link_change_chanreq(struct ieee80211_link_data *link, const struct ieee80211_chan_req *req, u64 *changed); +void __ieee80211_link_release_channel(struct ieee80211_link_data *link, + bool skip_idle_recalc); void ieee80211_link_release_channel(struct ieee80211_link_data *link); void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link); void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 395de62d9cb2..dc42902e2693 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -544,10 +544,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do sdata->vif.bss_conf.csa_active = false; if (sdata->vif.type == NL80211_IFTYPE_STATION) sdata->deflink.u.mgd.csa_waiting_bcn = false; - if (sdata->csa_blocked_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa_finalize_work); @@ -1699,8 +1699,13 @@ static void ieee80211_activate_links_work(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, activate_links_work); + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (local->in_reconfig) + return; ieee80211_set_active_links(&sdata->vif, sdata->desired_active_links); + sdata->desired_active_links = 0; } /* diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 685ec66b4264..af0321408a97 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -358,7 +358,7 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, ieee80211_teardown_tdls_peers(link); - ieee80211_link_release_channel(link); + __ieee80211_link_release_channel(link, true); } list_for_each_entry(sta, &local->sta_list, list) { @@ -404,9 +404,24 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, link = sdata_dereference(sdata->link[link_id], sdata); - ret = ieee80211_link_use_channel(link, - &link->conf->chanreq, - IEEE80211_CHANCTX_SHARED); + /* + * This call really should not fail. Unfortunately, it appears + * that this may happen occasionally with some drivers. Should + * it happen, we are stuck in a bad place as going backwards is + * not really feasible. + * + * So lets just tell link_use_channel that it must not fail to + * assign the channel context (from mac80211's perspective) and + * assume the driver is going to trigger a recovery flow if it + * had a failure. + * That really is not great nor guaranteed to work. But at least + * the internal mac80211 state remains consistent and there is + * a chance that we can recover. + */ + ret = _ieee80211_link_use_channel(link, + &link->conf->chanreq, + IEEE80211_CHANCTX_SHARED, + true); WARN_ON_ONCE(ret); ieee80211_mgd_set_link_qos_params(link); @@ -450,10 +465,13 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links) if (WARN_ON(!active_links)) return -EINVAL; + old_active = sdata->vif.active_links; + if (old_active == active_links) + return 0; + if (!drv_can_activate_links(local, sdata, active_links)) return -EINVAL; - old_active = sdata->vif.active_links; if (old_active & active_links) { /* * if there's at least one link that stays active across diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3bbb216a0fc8..a5f2d3cfe60d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -599,6 +599,10 @@ static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) return false; + if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ && + ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ)) + return false; + return true; } @@ -1933,13 +1937,14 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) WARN_ON(!link->conf->csa_active); - if (sdata->csa_blocked_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } link->conf->csa_active = false; + link->u.mgd.csa_blocked_tx = false; link->u.mgd.csa_waiting_bcn = false; ret = drv_post_channel_switch(link); @@ -1999,13 +2004,14 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) ieee80211_link_unreserve_chanctx(link); - if (sdata->csa_blocked_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } link->conf->csa_active = false; + link->u.mgd.csa_blocked_tx = false; drv_abort_channel_switch(link); } @@ -2165,12 +2171,13 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, link->csa_chanreq = csa_ie.chanreq; link->u.mgd.csa_ignored_same_chan = false; link->u.mgd.beacon_crc_valid = false; + link->u.mgd.csa_blocked_tx = csa_ie.mode; if (csa_ie.mode && !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) { ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = true; + sdata->csa_blocked_queues = true; } cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper, @@ -2199,7 +2206,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, * reset when the disconnection worker runs. */ link->conf->csa_active = true; - sdata->csa_blocked_tx = + link->u.mgd.csa_blocked_tx = csa_ie.mode; + sdata->csa_blocked_queues = csa_ie.mode && !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA); wiphy_work_queue(sdata->local->hw.wiphy, @@ -3252,12 +3260,13 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, } sdata->vif.bss_conf.csa_active = false; + sdata->deflink.u.mgd.csa_blocked_tx = false; sdata->deflink.u.mgd.csa_waiting_bcn = false; sdata->deflink.u.mgd.csa_ignored_same_chan = false; - if (sdata->csa_blocked_tx) { + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } /* existing TX TSPEC sessions no longer exist */ @@ -3563,19 +3572,32 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; - bool tx; + bool tx = false; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; - /* - * MLO drivers should have HANDLES_QUIET_CSA, so that csa_blocked_tx - * is always false; if they don't then this may try to transmit the - * frame but queues will be stopped. - */ - tx = !sdata->csa_blocked_tx; + /* only transmit if we have a link that makes that worthwhile */ + for (unsigned int link_id = 0; + link_id < ARRAY_SIZE(sdata->link); + link_id++) { + struct ieee80211_link_data *link; + + if (!ieee80211_vif_link_active(&sdata->vif, link_id)) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (WARN_ON_ONCE(!link)) + continue; + + if (link->u.mgd.csa_blocked_tx) + continue; + + tx = true; + break; + } if (!ifmgd->driver_disconnect) { unsigned int link_id; @@ -3608,10 +3630,11 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa_waiting_bcn = false; - if (sdata->csa_blocked_tx) { + sdata->deflink.u.mgd.csa_blocked_tx = false; + if (sdata->csa_blocked_queues) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); - sdata->csa_blocked_tx = false; + sdata->csa_blocked_queues = false; } ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx, @@ -4436,9 +4459,11 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: bss_conf->power_type = IEEE80211_REG_LPI_AP; break; case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: bss_conf->power_type = IEEE80211_REG_SP_AP; break; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: @@ -6803,6 +6828,60 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, __ieee80211_disconnect(sdata); } +static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, + struct wiphy_work *work) +{ + u16 new_dormant_links; + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.neg_ttlm_timeout_work.work); + + if (!sdata->vif.neg_ttlm.valid) + return; + + memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); + new_dormant_links = + sdata->vif.dormant_links & ~sdata->vif.suspended_links; + sdata->vif.suspended_links = 0; + ieee80211_vif_set_links(sdata, sdata->vif.valid_links, + new_dormant_links); + ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_TTLM | + BSS_CHANGED_MLD_VALID_LINKS); +} + +void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + int frame_len = offsetofend(struct ieee80211_mgmt, + u.action.u.ttlm_tear_down); + struct ieee80211_tx_info *info; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); + if (!skb) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, frame_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.ttlm_tear_down.action_code = + WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; + + info = IEEE80211_SKB_CB(skb); + info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + info->status_data = IEEE80211_STATUS_TYPE_NEG_TTLM; + ieee80211_tx_skb(sdata, skb); +} +EXPORT_SYMBOL(ieee80211_send_teardown_neg_ttlm); + void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { @@ -7434,6 +7513,8 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_tid_to_link_map_work); wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, ieee80211_neg_ttlm_timeout_work); + wiphy_work_init(&ifmgd->teardown_ttlm_work, + ieee80211_teardown_ttlm_work); ifmgd->flags = 0; ifmgd->powersave = sdata->wdev.ps; @@ -8220,6 +8301,14 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (req->ap_mld_addr) { uapsd_supported = true; + if (req->flags & (ASSOC_REQ_DISABLE_HT | + ASSOC_REQ_DISABLE_VHT | + ASSOC_REQ_DISABLE_HE | + ASSOC_REQ_DISABLE_EHT)) { + err = -EINVAL; + goto err_free; + } + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { struct ieee80211_supported_band *sband; struct cfg80211_bss *link_cbss = req->links[i].bss; @@ -8232,19 +8321,13 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (!bss->wmm_used) { err = -EINVAL; - goto err_free; - } - - if (req->flags & (ASSOC_REQ_DISABLE_HT | - ASSOC_REQ_DISABLE_VHT | - ASSOC_REQ_DISABLE_HE | - ASSOC_REQ_DISABLE_EHT)) { - err = -EINVAL; + req->links[i].error = err; goto err_free; } if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { err = -EINVAL; + req->links[i].error = err; goto err_free; } @@ -8621,6 +8704,8 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) &ifmgd->beacon_connection_loss_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); + wiphy_work_cancel(sdata->local->hw.wiphy, + &ifmgd->teardown_ttlm_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->tdls_peer_del_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 221695d841fd..65e1e9e971fd 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -897,8 +897,18 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, break; } - if (ether_addr_equal(conf->addr, mgmt->sa)) + if (ether_addr_equal(conf->addr, mgmt->sa)) { + /* If userspace requested Tx on a specific link + * use the same link id if the link bss is matching + * the requested chan. + */ + if (sdata->vif.valid_links && + params->link_id >= 0 && params->link_id == i && + params->chan == chanctx_conf->def.chan) + link_id = i; + break; + } chanctx_conf = NULL; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6e24864f9a40..4914692750e5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3368,7 +3368,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) if (ieee80211_hw_check(&rx->local->hw, DETECTS_COLOR_COLLISION)) return; - if (rx->sdata->vif.bss_conf.csa_active) + if (rx->link->conf->csa_active) return; baselen = mgmt->u.beacon.variable - rx->skb->data; @@ -3380,7 +3380,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) rx->skb->len - baselen); if (ie && ie->datalen >= sizeof(struct ieee80211_he_operation) && ie->datalen >= ieee80211_he_oper_size(ie->data + 1)) { - struct ieee80211_bss_conf *bss_conf = &rx->sdata->vif.bss_conf; + struct ieee80211_bss_conf *bss_conf = rx->link->conf; const struct ieee80211_he_operation *he_oper; u8 color; @@ -3393,7 +3393,8 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx) IEEE80211_HE_OPERATION_BSS_COLOR_MASK); if (color == bss_conf->he_bss_color.color) ieee80211_obss_color_collision_notify(&rx->sdata->vif, - BIT_ULL(color)); + BIT_ULL(color), + bss_conf->link_id); } } @@ -3969,8 +3970,8 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, -1, status->band); } - dev_kfree_skb(rx->skb); - return RX_QUEUED; + + return RX_DROP_U_UNKNOWN_ACTION_REJECTED; } static ieee80211_rx_result debug_noinline diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 73850312580f..3da1c5c45035 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -708,19 +708,11 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, return -EBUSY; /* For an MLO connection, if a link ID was specified, validate that it - * is indeed active. If no link ID was specified, select one of the - * active links. + * is indeed active. */ - if (ieee80211_vif_is_mld(&sdata->vif)) { - if (req->tsf_report_link_id >= 0) { - if (!(sdata->vif.active_links & - BIT(req->tsf_report_link_id))) - return -EINVAL; - } else { - req->tsf_report_link_id = - __ffs(sdata->vif.active_links); - } - } + if (ieee80211_vif_is_mld(&sdata->vif) && req->tsf_report_link_id >= 0 && + !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) + return -EINVAL; if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 327c74e296e2..b2de4c6fb808 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -155,6 +155,7 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_eht_operation _oper; struct ieee80211_eht_operation_info _oper_info; } __packed eht; + const struct ieee80211_eht_operation *eht_oper; if (conn->mode < IEEE80211_CONN_MODE_HE) { chandef->chan = NULL; @@ -203,19 +204,18 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, } if (conn->mode < IEEE80211_CONN_MODE_EHT) { - if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, - NULL, chandef)) - chandef->chan = NULL; + eht_oper = NULL; } else { eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT; eht._oper_info.control = he._6ghz_oper.control; eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0; eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1; - - if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, - &eht._oper, chandef)) - chandef->chan = NULL; + eht_oper = &eht._oper; } + + if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, + eht_oper, chandef)) + chandef->chan = NULL; } int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, @@ -348,6 +348,10 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, new_chandef = csa_ie->chanreq.oper; /* and update the width accordingly */ ieee80211_chandef_eht_oper(&bwi->info, &new_chandef); + + if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT) + new_chandef.punctured = + get_unaligned_le16(bwi->info.optional); } else if (!wide_bw_chansw_ie || !wbcs_elem_to_chandef(wide_bw_chansw_ie, &new_chandef)) { if (!ieee80211_operating_class_to_chandef(new_op_class, new_chan, diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index a52fb76386d0..bd5e2f7146f6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -886,7 +886,7 @@ void sta_info_stop(struct ieee80211_local *local); /** * __sta_info_flush - flush matching STA entries from the STA table * - * Returns the number of removed STA entries. + * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @vlans: if the given interface is an AP interface, also flush VLANs @@ -900,7 +900,7 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, /** * sta_info_flush - flush matching STA entries from the STA table * - * Returns the number of removed STA entries. + * Return: the number of removed STA entries. * * @sdata: sdata to remove all stations from * @link_id: if given (>=0), all those STA entries using @link_id only diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 1708b33cdc5e..dd8f857a1fbc 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2021-2023 Intel Corporation + * Copyright 2021-2024 Intel Corporation */ #include <linux/export.h> @@ -696,6 +696,23 @@ static void ieee80211_handle_smps_status(struct ieee80211_sub_if_data *sdata, wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); } +static void +ieee80211_handle_teardown_ttlm_status(struct ieee80211_sub_if_data *sdata, + bool acked) +{ + if (!sdata || !ieee80211_sdata_running(sdata)) + return; + + if (!acked) + return; + + if (sdata->vif.type != NL80211_IFTYPE_STATION) + return; + + wiphy_work_queue(sdata->local->hw.wiphy, + &sdata->u.mgd.teardown_ttlm_work); +} + static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped, ktime_t ack_hwtstamp) @@ -773,6 +790,9 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, ieee80211_handle_smps_status(sdata, acked, info->status_data); break; + case IEEE80211_STATUS_TYPE_NEG_TTLM: + ieee80211_handle_teardown_ttlm_status(sdata, acked); + break; } rcu_read_unlock(); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index cfd0a62d0152..f861d99e5f05 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1609,8 +1609,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) local->cparams.target = MS2TIME(20); local->cparams.ecn = true; - local->cvars = kcalloc(fq->flows_cnt, sizeof(local->cvars[0]), - GFP_KERNEL); + local->cvars = kvcalloc(fq->flows_cnt, sizeof(local->cvars[0]), + GFP_KERNEL); if (!local->cvars) { spin_lock_bh(&fq->lock); fq_reset(fq, fq_skb_free_func); @@ -1630,7 +1630,7 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; - kfree(local->cvars); + kvfree(local->cvars); local->cvars = NULL; spin_lock_bh(&fq->lock); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index a237cbcf7b49..0b893e958959 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1932,6 +1932,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) old); } + sdata->restart_active_links = active_links; + for (link_id = 0; link_id < ARRAY_SIZE(sdata->vif.link_conf); link_id++) { @@ -2059,9 +2061,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(1); break; } - - if (active_links) - ieee80211_set_active_links(&sdata->vif, active_links); } ieee80211_recalc_ps(local); @@ -2102,6 +2101,20 @@ int ieee80211_reconfig(struct ieee80211_local *local) list_for_each_entry(sdata, &local->interfaces, list) ieee80211_reenable_keys(sdata); + /* re-enable multi-link for client interfaces */ + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->restart_active_links) + ieee80211_set_active_links(&sdata->vif, + sdata->restart_active_links); + /* + * If a link switch was scheduled before the restart, and ran + * before reconfig, it will do nothing, so re-schedule. + */ + if (sdata->desired_active_links) + wiphy_work_queue(sdata->local->hw.wiphy, + &sdata->activate_links_work); + } + /* Reconfigure sched scan if it was interrupted by FW restart */ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); @@ -3136,6 +3149,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, } else { ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &he_chandef); + he_chandef.punctured = + ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); } if (!cfg80211_chandef_valid(&he_chandef)) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index e40529b8c5c9..047a33797020 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -895,7 +895,8 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) info = IEEE80211_SKB_CB(skb); - if (info->control.hw_key) + if (info->control.hw_key && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) @@ -911,6 +912,9 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) bip_ipn_set64(mmie->sequence_number, pn64); + if (info->control.hw_key) + return TX_CONTINUE; + bip_aad(skb, aad); /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) @@ -1040,7 +1044,8 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) info = IEEE80211_SKB_CB(skb); - if (info->control.hw_key) + if (info->control.hw_key && + !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) @@ -1056,6 +1061,9 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) bip_ipn_set64(mmie->sequence_number, pn64); + if (info->control.hw_key) + return TX_CONTINUE; + bip_aad(skb, aad); hdr = (struct ieee80211_hdr *)skb->data; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 6dab883a08dd..2dc7a908a6bb 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -594,7 +594,7 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, struct in_addr daddr; memcpy(&daddr, addr, sizeof(struct in_addr)); - rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); + rt = ip_route_output(net, daddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return ERR_CAST(rt); @@ -1154,7 +1154,7 @@ static int mpls_netconf_fill_devconf(struct sk_buff *skb, struct mpls_dev *mdev, if ((all || type == NETCONFA_INPUT) && nla_put_s32(skb, NETCONFA_INPUT, - mdev->input_enabled) < 0) + READ_ONCE(mdev->input_enabled)) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -1303,11 +1303,12 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct hlist_head *head; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; struct net_device *dev; struct mpls_dev *mdev; - int idx, s_idx; - int h, s_h; + int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; @@ -1324,40 +1325,23 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, } } - s_h = cb->args[0]; - s_idx = idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - rcu_read_lock(); - cb->seq = net->dev_base_seq; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - mdev = mpls_dev_get(dev); - if (!mdev) - goto cont; - if (mpls_netconf_fill_devconf(skb, mdev, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, - RTM_NEWNETCONF, - NLM_F_MULTI, - NETCONFA_ALL) < 0) { - rcu_read_unlock(); - goto done; - } - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); -cont: - idx++; - } - rcu_read_unlock(); + rcu_read_lock(); + for_each_netdev_dump(net, dev, ctx->ifindex) { + mdev = mpls_dev_get(dev); + if (!mdev) + continue; + err = mpls_netconf_fill_devconf(skb, mdev, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, + RTM_NEWNETCONF, + NLM_F_MULTI, + NETCONFA_ALL); + if (err < 0) + break; } -done: - cb->args[0] = h; - cb->args[1] = idx; + rcu_read_unlock(); - return skb->len; + return err; } #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ @@ -1393,13 +1377,13 @@ static const struct ctl_table mpls_dev_table[] = { .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, - { } }; static int mpls_dev_sysctl_register(struct net_device *dev, struct mpls_dev *mdev) { char path[sizeof("net/mpls/conf/") + IFNAMSIZ]; + size_t table_size = ARRAY_SIZE(mpls_dev_table); struct net *net = dev_net(dev); struct ctl_table *table; int i; @@ -1411,7 +1395,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++) { + for (i = 0; i < table_size; i++) { table[i].data = (char *)mdev + (uintptr_t)table[i].data; table[i].extra1 = mdev; table[i].extra2 = net; @@ -1419,8 +1403,7 @@ static int mpls_dev_sysctl_register(struct net_device *dev, snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); - mdev->sysctl = register_net_sysctl_sz(net, path, table, - ARRAY_SIZE(mpls_dev_table)); + mdev->sysctl = register_net_sysctl_sz(net, path, table, table_size); if (!mdev->sysctl) goto free; @@ -1438,7 +1421,7 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev, struct mpls_dev *mdev) { struct net *net = dev_net(dev); - struct ctl_table *table; + const struct ctl_table *table; if (!mdev->sysctl) return; @@ -2669,11 +2652,11 @@ static const struct ctl_table mpls_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &ttl_max, }, - { } }; static int mpls_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; @@ -2689,11 +2672,11 @@ static int mpls_net_init(struct net *net) /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, - ARRAY_SIZE(mpls_table)); + table_size); if (net->mpls.ctl == NULL) { kfree(table); return -ENOMEM; @@ -2706,7 +2689,7 @@ static void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; - struct ctl_table *table; + const struct ctl_table *table; unsigned int index; table = net->mpls.ctl->ctl_table_arg; @@ -2773,7 +2756,8 @@ static int __init mpls_init(void) mpls_getroute, mpls_dump_routes, 0); rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, - mpls_netconf_dump_devconf, 0); + mpls_netconf_dump_devconf, + RTNL_FLAG_DUMP_UNLOCKED); err = ipgre_tunnel_encap_add_mpls_ops(); if (err) pr_err("Can't add mpls over gre tunnel ops\n"); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 8fc790f2a01b..4385fd3b13be 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -81,7 +81,7 @@ static int mpls_xmit(struct sk_buff *skb) ttl = net->mpls.default_ttl; else ttl = ip_hdr(skb)->ttl; - rt = (struct rtable *)dst; + rt = dst_rtable(dst); } else if (dst->ops->family == AF_INET6) { if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) ttl = tun_encap_info->default_ttl; @@ -90,7 +90,7 @@ static int mpls_xmit(struct sk_buff *skb) ttl = net->mpls.default_ttl; else ttl = ipv6_hdr(skb)->hop_limit; - rt6 = (struct rt6_info *)dst; + rt6 = dst_rt6_info(dst); } else { goto drop; } diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 2963ba84e2ee..49abf80b3fad 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -193,7 +193,6 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - {} }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -235,7 +234,7 @@ err_alloc: static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { - struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; + const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 27ca42c77b02..8e8dcfbc2993 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1068,6 +1068,7 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } + msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 965eb69dc5de..bb8f96f2b86f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -20,6 +20,7 @@ #include <net/transp_v6.h> #endif #include <net/mptcp.h> +#include <net/hotdata.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include "protocol.h" @@ -706,6 +707,8 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } } while (more_data_avail); + if (moved > 0) + msk->last_data_recv = tcp_jiffies32; *bytes += moved; return done; } @@ -1270,7 +1273,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { + if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } @@ -1556,6 +1559,8 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk, err = copied; out: + if (err > 0) + msk->last_data_sent = tcp_jiffies32; return err; } @@ -2056,7 +2061,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); - tcp_sk(ssk)->window_clamp = window_clamp; + WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } @@ -2565,7 +2570,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { - tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); @@ -2793,6 +2798,9 @@ static void __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; msk->subflow_id = 1; + msk->last_data_sent = tcp_jiffies32; + msk->last_data_recv = tcp_jiffies32; + msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a10ebf3ee10a..cfc5f9c3f113 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -12,6 +12,7 @@ #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> #include <net/genetlink.h> +#include <net/rstreason.h> #include "mptcp_pm_gen.h" @@ -282,6 +283,9 @@ struct mptcp_sock { u64 bytes_acked; u64 snd_una; u64 wnd_end; + u32 last_data_sent; + u32 last_data_recv; + u32 last_ack_recv; unsigned long timer_ival; u32 token; int rmem_released; @@ -558,7 +562,7 @@ struct mptcp_subflow_context { static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; @@ -578,6 +582,43 @@ mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) WRITE_ONCE(subflow->local_id, -1); } +/* Convert reset reasons in MPTCP to enum sk_rst_reason type */ +static inline enum sk_rst_reason +sk_rst_convert_mptcp_reason(u32 reason) +{ + switch (reason) { + case MPTCP_RST_EUNSPEC: + return SK_RST_REASON_MPTCP_RST_EUNSPEC; + case MPTCP_RST_EMPTCP: + return SK_RST_REASON_MPTCP_RST_EMPTCP; + case MPTCP_RST_ERESOURCE: + return SK_RST_REASON_MPTCP_RST_ERESOURCE; + case MPTCP_RST_EPROHIBIT: + return SK_RST_REASON_MPTCP_RST_EPROHIBIT; + case MPTCP_RST_EWQ2BIG: + return SK_RST_REASON_MPTCP_RST_EWQ2BIG; + case MPTCP_RST_EBADPERF: + return SK_RST_REASON_MPTCP_RST_EBADPERF; + case MPTCP_RST_EMIDDLEBOX: + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; + default: + /* It should not happen, or else errors may occur + * in MPTCP layer + */ + return SK_RST_REASON_ERROR; + } +} + +static inline void +mptcp_send_active_reset_reason(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + enum sk_rst_reason reason; + + reason = sk_rst_convert_mptcp_reason(subflow->reset_reason); + tcp_send_active_reset(sk, GFP_ATOMIC, reason); +} + static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 73fdf423de44..1fea43f5b6f3 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -898,6 +898,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) struct sock *sk = (struct sock *)msk; u32 flags = 0; bool slow; + u32 now; memset(info, 0, sizeof(*info)); @@ -926,11 +927,6 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; - mptcp_data_lock(sk); - info->mptcpi_snd_una = msk->snd_una; - info->mptcpi_rcv_nxt = msk->ack_seq; - info->mptcpi_bytes_acked = msk->bytes_acked; - mptcp_data_unlock(sk); slow = lock_sock_fast(sk); info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); @@ -942,7 +938,17 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_retrans = msk->bytes_retrans; info->mptcpi_subflows_total = info->mptcpi_subflows + __mptcp_has_initial_subflow(msk); + now = tcp_jiffies32; + info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); + info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv); unlock_sock_fast(sk, slow); + + mptcp_data_lock(sk); + info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv); + info->mptcpi_snd_una = msk->snd_una; + info->mptcpi_rcv_nxt = msk->ack_seq; + info->mptcpi_bytes_acked = msk->bytes_acked; + mptcp_data_unlock(sk); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); @@ -1523,7 +1529,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, space); - tcp_sk(ssk)->window_clamp = val; + WRITE_ONCE(tcp_sk(ssk)->window_clamp, val); unlock_sock_fast(ssk, slow); } return 0; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 6042a47da61b..c1d13e555d10 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -20,6 +20,7 @@ #include <net/transp_v6.h> #endif #include <net/mptcp.h> + #include "protocol.h" #include "mib.h" @@ -150,8 +151,10 @@ static int subflow_check_req(struct request_sock *req, /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ - if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) + if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EINVAL; + } #endif mptcp_get_options(skb, &mp_opt); @@ -219,6 +222,7 @@ again: ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); @@ -227,10 +231,12 @@ again: subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { - if (mptcp_can_accept_new_subflow(subflow_req->msk)) - subflow_init_req_cookie_join_save(subflow_req, skb); - else + if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; + } + + subflow_init_req_cookie_join_save(subflow_req, skb); } pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, @@ -281,10 +287,21 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); +static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb) +{ + const struct mptcp_ext *mpext = mptcp_get_ext(skb); + + if (!mpext) + return SK_RST_REASON_NOT_SPECIFIED; + + return sk_rst_convert_mptcp_reason(mpext->reset_reason); +} + static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -292,7 +309,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; @@ -302,7 +319,8 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, dst_release(dst); if (!req->syncookie) - tcp_request_sock_ops.send_reset(sk, skb); + tcp_request_sock_ops.send_reset(sk, skb, + mptcp_get_rst_reason(skb)); return NULL; } @@ -351,7 +369,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct request_sock *req) + struct request_sock *req, + u32 tw_isn) { struct dst_entry *dst; int err; @@ -359,7 +378,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); - dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; @@ -369,7 +388,8 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk, dst_release(dst); if (!req->syncookie) - tcp6_request_sock_ops.send_reset(sk, skb); + tcp6_request_sock_ops.send_reset(sk, skb, + mptcp_get_rst_reason(skb)); return NULL; } #endif @@ -405,7 +425,7 @@ void mptcp_subflow_reset(struct sock *ssk) /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); - tcp_send_active_reset(ssk, GFP_ATOMIC); + mptcp_send_active_reset_reason(ssk); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); @@ -774,6 +794,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; + enum sk_rst_reason reason; struct mptcp_sock *owner; struct sock *child; @@ -873,13 +894,18 @@ create_child: ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } - if (!mptcp_finish_join(child)) + if (!mptcp_finish_join(child)) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child); + + subflow_add_reset_reason(skb, subflow->reset_reason); goto dispose_child; + } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; @@ -887,7 +913,7 @@ create_child: } /* check for expected invariant - should never trigger, just help - * catching eariler subtle bugs + * catching earlier subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || @@ -899,7 +925,8 @@ dispose_child: tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); - req->rsk_ops->send_reset(sk, skb); + reason = mptcp_get_rst_reason(skb); + req->rsk_ops->send_reset(sk, skb, reason); /* The last child reference will be released by the caller */ return child; @@ -1234,7 +1261,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; - /* greceful failure can happen only on the MPC subflow */ + /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) return; @@ -1336,7 +1363,7 @@ reset: tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); - tcp_send_active_reset(ssk, GFP_ATOMIC); + mptcp_send_active_reset_reason(ssk); WRITE_ONCE(subflow->data_avail, false); return false; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a2c16b501087..c7a8a08b7308 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1550,6 +1550,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (!dest) goto unk; if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(flags); __be16 type; /* Only support version 0 and C (csum) */ @@ -1560,7 +1561,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, if (type != htons(ETH_P_IP)) goto unk; *proto = IPPROTO_IPIP; - return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + + gre_flags_to_tnl_flags(flags, greh->flags); + + return gre_calc_hlen(flags); } unk: diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 143a341bbc0a..b6d0dcf3a5c3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -94,6 +94,7 @@ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; + int amemthresh; int nomem; int to_change = -1; @@ -105,7 +106,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < ipvs->sysctl_amemthresh); + amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); + nomem = (availmem < amemthresh); local_bh_disable(); @@ -145,9 +147,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 1: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; @@ -155,9 +156,8 @@ static void update_defense_level(struct netns_ipvs *ipvs) break; case 2: if (nomem) { - ipvs->drop_rate = ipvs->drop_counter - = ipvs->sysctl_amemthresh / - (ipvs->sysctl_amemthresh-availmem); + ipvs->drop_counter = amemthresh / (amemthresh - availmem); + ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; @@ -2263,7 +2263,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { } }; #endif @@ -4270,6 +4269,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); + bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); @@ -4284,12 +4284,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; - ctl_table_size = 0; - } } else tbl = vs_vars; /* Initialize sysctl defaults */ @@ -4315,10 +4309,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; + ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; + ipvs->sysctl_sync_sock_size = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; @@ -4341,15 +4342,22 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; + if (unpriv) + tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 8ceec7a2fa8f..2423513d701d 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -123,7 +123,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -563,10 +562,8 @@ static int __net_init __ip_vs_lblc_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblc_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblc_ctl_table = vs_vars_table; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0fb64707213f..cdb1d4bf6761 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -294,7 +294,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -749,10 +748,8 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblcr_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 65e0259178da..3313bceb6cc9 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -180,7 +180,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (addr_type & IPV6_ADDR_LOOPBACK); old_rt_is_local = __ip_vs_is_local_route6( - (struct rt6_info *)skb_dst(skb)); + dst_rt6_info(skb_dst(skb))); } else #endif { @@ -318,7 +318,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rtable *) dest_dst->dst_cache; + rt = dst_rtable(dest_dst->dst_cache); else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); @@ -390,10 +390,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { @@ -481,7 +481,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) - rt = (struct rt6_info *) dest_dst->dst_cache; + rt = dst_rt6_info(dest_dst->dst_cache); else { u32 cookie; @@ -501,7 +501,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, ip_vs_dest_dst_free(dest_dst); goto err_unreach; } - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); @@ -517,7 +517,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, rt_mode); if (!dst) goto err_unreach; - rt = (struct rt6_info *) dst; + rt = dst_rt6_info(dst); } local = __ip_vs_is_local_route6(rt); @@ -553,10 +553,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { @@ -862,7 +862,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_RT_MODE_RDR); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed @@ -1082,11 +1082,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb, { __be16 proto = *next_protocol == IPPROTO_IPIP ? htons(ETH_P_IP) : htons(ETH_P_IPV6); - __be16 tflags = 0; + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t hdrlen; if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); hdrlen = gre_calc_hlen(tflags); gre_build_header(skb, hdrlen, tflags, proto, 0, 0); @@ -1165,11 +1165,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1288,7 +1288,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (local) return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1); - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); tdev = rt->dst.dev; /* @@ -1310,11 +1310,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom += sizeof(struct udphdr) + gue_hdrlen; } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + IP_TUNNEL_DECLARE_FLAGS(tflags) = { }; size_t gre_hdrlen; - __be16 tflags = 0; if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) - tflags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tflags); gre_hdrlen = gre_calc_hlen(tflags); max_headroom += gre_hdrlen; @@ -1590,7 +1590,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, &cp->daddr.in6, NULL, ipvsh, 0, rt_mode); if (local < 0) goto tx_error; - rt = (struct rt6_info *) skb_dst(skb); + rt = dst_rt6_info(skb_dst(skb)); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee98ce5b816..74112e9c5dab 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -616,11 +616,9 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LWTUNNEL, #endif - __NF_SYSCTL_CT_LAST_SYSCTL, + NF_SYSCTL_CT_LAST_SYSCTL, }; -#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) - static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", @@ -957,7 +955,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = nf_hooks_lwtunnel_sysctl_handler, }, #endif - {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -968,7 +965,6 @@ static struct ctl_table nf_ct_netfilter_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, @@ -1122,7 +1118,7 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - struct ctl_table *table; + const struct ctl_table *table; table = cnet->sysctl_header->ctl_table_arg; unregister_net_sysctl_table(cnet->sysctl_header); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index a0571339239c..5c1ff07eaee0 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -77,12 +77,8 @@ EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { - const struct rt6_info *rt; - - if (flow_tuple->l3proto == NFPROTO_IPV6) { - rt = (const struct rt6_info *)flow_tuple->dst_cache; - return rt6_get_cookie(rt); - } + if (flow_tuple->l3proto == NFPROTO_IPV6) + return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 5383bed3d3e0..c2c005234dcd 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -434,7 +434,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; @@ -446,7 +446,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rtable *)tuplehash->tuple.dst_cache; + rt = dst_rtable(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); @@ -729,7 +729,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; @@ -741,7 +741,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + rt = dst_rt6_info(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 370f8231385c..769fd7680fac 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -395,7 +395,7 @@ static const struct seq_operations nflog_seq_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; -static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { @@ -406,7 +406,6 @@ static struct ctl_table nf_log_sysctl_ftable[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int nf_log_proc_dostring(struct ctl_table *table, int write, @@ -514,7 +513,7 @@ err_alloc: static void netfilter_log_sysctl_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->nf.nf_log_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_log_dir_header); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index c9fbe0f707b5..4abf660c7baf 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -427,6 +427,9 @@ replay_abort: nfnl_unlock(subsys_id); + if (nlh->nlmsg_flags & NLM_F_ACK) + nfnl_err_add(&err_list, nlh, 0, &extack); + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -573,6 +576,8 @@ done: } else if (err) { ss->abort(net, oskb, NFNL_ABORT_NONE); netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } else if (nlh->nlmsg_flags & NLM_F_ACK) { + nfnl_err_add(&err_list, nlh, 0, &extack); } } else { enum nfnl_abort_action abort_action; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 24d977138572..14d88394bcb7 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr, if (nft_pf(pkt) != NFPROTO_IPV4) goto err; - *dest = (__force u32)rt_nexthop((const struct rtable *)dst, + *dest = (__force u32)rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr); break; case NFT_RT_NEXTHOP6: if (nft_pf(pkt) != NFPROTO_IPV6) goto err; - memcpy(dest, rt6_nexthop((struct rt6_info *)dst, + memcpy(dest, rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index f735d79d8be5..60a76e6e348e 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -174,8 +174,8 @@ struct nft_tunnel_opts { struct erspan_metadata erspan; u8 data[IP_TUNNEL_OPTS_MAX]; } u; + IP_TUNNEL_DECLARE_FLAGS(flags); u32 len; - __be16 flags; }; struct nft_tunnel_obj { @@ -271,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr, opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP])); opts->len = sizeof(struct vxlan_metadata); - opts->flags = TUNNEL_VXLAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags); return 0; } @@ -325,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, opts->u.erspan.version = version; opts->len = sizeof(struct erspan_metadata); - opts->flags = TUNNEL_ERSPAN_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags); return 0; } @@ -366,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, opt->length = data_len / 4; opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); - opts->flags = TUNNEL_GENEVE_OPT; + ip_tunnel_flags_zero(opts->flags); + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags); return 0; } @@ -385,8 +388,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct nft_tunnel_opts *opts) { struct nlattr *nla; - __be16 type = 0; int err, rem; + u32 type = 0; err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, nft_tunnel_opts_policy, NULL); @@ -401,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_vxlan_init(nla, opts); if (err) return err; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_ERSPAN: if (type) @@ -409,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, err = nft_tunnel_obj_erspan_init(nla, opts); if (err) return err; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; case NFTA_TUNNEL_KEY_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) return -EINVAL; err = nft_tunnel_obj_geneve_init(nla, opts); if (err) return err; - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; default: return -EOPNOTSUPP; @@ -454,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, memset(&info, 0, sizeof(info)); info.mode = IP_TUNNEL_INFO_TX; info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID])); - info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); + __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags); if (tb[NFTA_TUNNEL_KEY_IP]) { err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info); @@ -483,11 +488,12 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX) - info.key.tun_flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT) - info.key.tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + info.key.tun_flags); if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER) - info.key.tun_flags |= TUNNEL_SEQ; + __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags); } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); @@ -583,7 +589,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, if (!nest) return -1; - if (opts->flags & TUNNEL_VXLAN_OPT) { + if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN); if (!inner) goto failure; @@ -591,7 +597,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, htonl(opts->u.vxlan.gbp))) goto inner_failure; nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) { inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN); if (!inner) goto failure; @@ -613,7 +619,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, break; } nla_nest_end(skb, inner); - } else if (opts->flags & TUNNEL_GENEVE_OPT) { + } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) { struct geneve_opt *opt; int offset = 0; @@ -658,11 +664,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb, { u32 flags = 0; - if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_DONT_FRAGMENT; - if (!(info->key.tun_flags & TUNNEL_CSUM)) + if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_ZERO_CSUM_TX; - if (info->key.tun_flags & TUNNEL_SEQ) + if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags)) flags |= NFT_TUNNEL_F_SEQ_NUMBER; if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7554803218a2..fa9c090cf629 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -59,7 +59,6 @@ #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> -#include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> @@ -73,6 +72,7 @@ #include <trace/events/netlink.h> #include "af_netlink.h" +#include "genetlink.h" struct listeners { struct rcu_head rcu; @@ -2165,6 +2165,69 @@ __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int fla } EXPORT_SYMBOL(__nlmsg_put); +static size_t +netlink_ack_tlv_len(struct netlink_sock *nlk, int err, + const struct netlink_ext_ack *extack) +{ + size_t tlvlen; + + if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) + return 0; + + tlvlen = 0; + if (extack->_msg) + tlvlen += nla_total_size(strlen(extack->_msg) + 1); + if (extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); + + /* Following attributes are only reported as error (not warning) */ + if (!err) + return tlvlen; + + if (extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + if (extack->policy) + tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); + if (extack->miss_type) + tlvlen += nla_total_size(sizeof(u32)); + if (extack->miss_nest) + tlvlen += nla_total_size(sizeof(u32)); + + return tlvlen; +} + +static void +netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, + const struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack) +{ + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, extack->cookie)); + + if (!err) + return; + + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - (const u8 *)nlh)); + if (extack->policy) + netlink_policy_dump_write_attr(skb, extack->policy, + NLMSGERR_ATTR_POLICY); + if (extack->miss_type) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, + extack->miss_type)); + if (extack->miss_nest && + !WARN_ON((u8 *)extack->miss_nest < in_skb->data || + (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, + (u8 *)extack->miss_nest - (const u8 *)nlh)); +} + /* * It looks a bit ugly. * It would be better to create kernel thread. @@ -2175,6 +2238,7 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; + size_t extack_len; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); @@ -2184,10 +2248,14 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); - if (extack->_msg && test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) { + extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack); + if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; - if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) + if (skb_tailroom(skb) >= extack_len) { + netlink_ack_tlv_fill(cb->skb, skb, cb->nlh, + nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); + } } return 0; @@ -2406,69 +2474,6 @@ error_free: } EXPORT_SYMBOL(__netlink_dump_start); -static size_t -netlink_ack_tlv_len(struct netlink_sock *nlk, int err, - const struct netlink_ext_ack *extack) -{ - size_t tlvlen; - - if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) - return 0; - - tlvlen = 0; - if (extack->_msg) - tlvlen += nla_total_size(strlen(extack->_msg) + 1); - if (extack->cookie_len) - tlvlen += nla_total_size(extack->cookie_len); - - /* Following attributes are only reported as error (not warning) */ - if (!err) - return tlvlen; - - if (extack->bad_attr) - tlvlen += nla_total_size(sizeof(u32)); - if (extack->policy) - tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); - if (extack->miss_type) - tlvlen += nla_total_size(sizeof(u32)); - if (extack->miss_nest) - tlvlen += nla_total_size(sizeof(u32)); - - return tlvlen; -} - -static void -netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, - struct nlmsghdr *nlh, int err, - const struct netlink_ext_ack *extack) -{ - if (extack->_msg) - WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); - if (extack->cookie_len) - WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, - extack->cookie_len, extack->cookie)); - - if (!err) - return; - - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - (u8 *)nlh)); - if (extack->policy) - netlink_policy_dump_write_attr(skb, extack->policy, - NLMSGERR_ATTR_POLICY); - if (extack->miss_type) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, - extack->miss_type)); - if (extack->miss_nest && - !WARN_ON((u8 *)extack->miss_nest < in_skb->data || - (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, - (u8 *)extack->miss_nest - (u8 *)nlh)); -} - void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 3b7666944b11..feb54c63a116 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -22,6 +22,8 @@ #include <net/sock.h> #include <net/genetlink.h> +#include "genetlink.h" + static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); diff --git a/net/netlink/genetlink.h b/net/netlink/genetlink.h new file mode 100644 index 000000000000..89bd9d2631c3 --- /dev/null +++ b/net/netlink/genetlink.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_GENETLINK_H +#define __NET_GENETLINK_H + +#include <linux/wait.h> + +/* for synchronisation between af_netlink and genetlink */ +extern atomic_t genl_sk_destructing_cnt; +extern wait_queue_head_t genl_sk_destructing_waitq; + +#endif /* __LINUX_GENERIC_NETLINK_H */ diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index 79fb2d3f477b..7dc0fa628f2e 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -140,7 +140,6 @@ static struct ctl_table nr_table[] = { .extra1 = &min_reset, .extra2 = &max_reset }, - { } }; int __init nr_register_sysctl(void) diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index aa1dbf654c3e..dd2ce73a24fb 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -969,8 +969,7 @@ static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) int rc; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || - !info->attrs[NFC_ATTR_TARGET_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); @@ -1018,8 +1017,7 @@ static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg = NULL; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || - !info->attrs[NFC_ATTR_FIRMWARE_NAME]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 11c69415c605..99d72543abd3 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -15,7 +15,6 @@ #include <linux/delay.h> #include <linux/time.h> #include <linux/etherdevice.h> -#include <linux/genetlink.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mutex.h> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ebc5728aab4e..f224d9bcea5e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -152,6 +152,13 @@ static void update_range(struct sw_flow_match *match, sizeof((match)->key->field)); \ } while (0) +#define SW_FLOW_KEY_BITMAP_COPY(match, field, value_p, nbits, is_mask) ({ \ + update_range(match, offsetof(struct sw_flow_key, field), \ + bitmap_size(nbits), is_mask); \ + bitmap_copy(is_mask ? (match)->mask->key.field : (match)->key->field, \ + value_p, nbits); \ +}) + static bool match_validate(const struct sw_flow_match *match, u64 key_attrs, u64 mask_attrs, bool log) { @@ -670,8 +677,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, bool log) { bool ttl = false, ipv4 = false, ipv6 = false; + IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; bool info_bridge_mode = false; - __be16 tun_flags = 0; int opts_type = 0; struct nlattr *a; int rem; @@ -697,7 +704,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_ID: SW_FLOW_KEY_PUT(match, tun_key.tun_id, nla_get_be64(a), is_mask); - tun_flags |= TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, @@ -729,10 +736,10 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, ttl = true; break; case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_TP_SRC: SW_FLOW_KEY_PUT(match, tun_key.tp_src, @@ -743,7 +750,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, nla_get_be16(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_OAM: - tun_flags |= TUNNEL_OAM; + __set_bit(IP_TUNNEL_OAM_BIT, tun_flags); break; case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: if (opts_type) { @@ -755,7 +762,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, if (err) return err; - tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_flags); opts_type = type; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: @@ -768,7 +775,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, if (err) return err; - tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_flags); opts_type = type; break; case OVS_TUNNEL_KEY_ATTR_PAD: @@ -784,7 +791,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, if (err) return err; - tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_flags); opts_type = type; break; case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE: @@ -798,7 +805,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, } } - SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + SW_FLOW_KEY_BITMAP_COPY(match, tun_key.tun_flags, tun_flags, + __IP_TUNNEL_FLAG_NUM, is_mask); if (is_mask) SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); else @@ -823,13 +831,15 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, } if (ipv4) { if (info_bridge_mode) { + __clear_bit(IP_TUNNEL_KEY_BIT, tun_flags); + if (match->key->tun_key.u.ipv4.src || match->key->tun_key.u.ipv4.dst || match->key->tun_key.tp_src || match->key->tun_key.tp_dst || match->key->tun_key.ttl || match->key->tun_key.tos || - tun_flags & ~TUNNEL_KEY) { + !ip_tunnel_flags_empty(tun_flags)) { OVS_NLERR(log, "IPv4 tun info is not correct"); return -EINVAL; } @@ -874,7 +884,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len, unsigned short tun_proto, u8 mode) { - if (output->tun_flags & TUNNEL_KEY && + if (test_bit(IP_TUNNEL_KEY_BIT, output->tun_flags) && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, OVS_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; @@ -910,10 +920,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, return -EMSGSIZE; if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, output->tun_flags) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_CSUM) && + if (test_bit(IP_TUNNEL_CSUM_BIT, output->tun_flags) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (output->tp_src && @@ -922,18 +932,20 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, if (output->tp_dst && nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_OAM) && + if (test_bit(IP_TUNNEL_OAM_BIT, output->tun_flags) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (swkey_tun_opts_len) { - if (output->tun_flags & TUNNEL_GENEVE_OPT && + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, output->tun_flags) && nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_VXLAN_OPT && + else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, + output->tun_flags) && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + output->tun_flags) && nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, swkey_tun_opts_len, tun_opts)) return -EMSGSIZE; @@ -2029,7 +2041,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if ((swkey->tun_proto || is_mask)) { const void *opts = NULL; - if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + if (ip_tunnel_is_options_present(output->tun_key.tun_flags)) opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); if (ip_tun_to_nlattr(skb, &output->tun_key, opts, @@ -2752,7 +2764,8 @@ static int validate_geneve_opts(struct sw_flow_key *key) opts_len -= len; } - key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + if (crit_opt) + __set_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_key.tun_flags); return 0; } @@ -2760,6 +2773,7 @@ static int validate_geneve_opts(struct sw_flow_key *key) static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { + IP_TUNNEL_DECLARE_FLAGS(dst_opt_type) = { }; struct sw_flow_match match; struct sw_flow_key key; struct metadata_dst *tun_dst; @@ -2767,9 +2781,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; - __be16 dst_opt_type; - dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2781,13 +2793,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; - dst_opt_type = TUNNEL_GENEVE_OPT; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, dst_opt_type); break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: - dst_opt_type = TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, dst_opt_type); break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - dst_opt_type = TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, dst_opt_type); break; } } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index ed11cd12b512..8bbf983cd244 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -11,7 +11,6 @@ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/openvswitch.h> -#include <linux/genetlink.h> #include <linux/skbuff.h> #include <linux/bits.h> diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 903537a5da22..91a11067e458 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -82,6 +82,13 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) err = -ENODEV; goto error_free_vport; } + /* Ensure that the device exists and that the provided + * name is not one of its aliases. + */ + if (strcmp(name, ovs_vport_name(vport))) { + err = -ENODEV; + goto error_put; + } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 18f616f487ea..8c6d3fbb4ed8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3800,28 +3800,30 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, case PACKET_TX_RING: { union tpacket_req_u req_u; - int len; + ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: - len = sizeof(req_u.req); + if (optlen < sizeof(req_u.req)) + break; + ret = copy_from_sockptr(&req_u.req, optval, + sizeof(req_u.req)) ? + -EINVAL : 0; break; case TPACKET_V3: default: - len = sizeof(req_u.req3); + if (optlen < sizeof(req_u.req3)) + break; + ret = copy_from_sockptr(&req_u.req3, optval, + sizeof(req_u.req3)) ? + -EINVAL : 0; break; } - if (optlen < len) { - ret = -EINVAL; - } else { - if (copy_from_sockptr(&req_u.req, optval, len)) - ret = -EFAULT; - else - ret = packet_set_ring(sk, &req_u, 0, - optname == PACKET_TX_RING); - } + if (!ret) + ret = packet_set_ring(sk, &req_u, 0, + optname == PACKET_TX_RING); release_sock(sk); return ret; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index dd4c7e9a634f..7008d402499d 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -178,7 +178,7 @@ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_u8(skb, RTA_DST, dst) || - nla_put_u32(skb, RTA_OIF, dev->ifindex)) + nla_put_u32(skb, RTA_OIF, READ_ONCE(dev->ifindex))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -263,6 +263,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + int err = 0; u8 addr; rcu_read_lock(); @@ -272,16 +273,16 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) continue; - if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE) < 0) - goto out; + err = fill_route(skb, dev, addr << 2, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE); + if (err < 0) + break; } - -out: rcu_read_unlock(); cb->args[0] = addr; - return skb->len; + return err; } int __init phonet_netlink_register(void) @@ -301,6 +302,6 @@ int __init phonet_netlink_register(void) rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0); rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE, - NULL, route_dumpit, 0); + NULL, route_dumpit, RTNL_FLAG_DUMP_UNLOCKED); return 0; } diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 0d0bf41381c2..82fc22467a09 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -81,7 +81,6 @@ static struct ctl_table phonet_table[] = { .mode = 0644, .proc_handler = proc_local_port_range, }, - { } }; int __init phonet_sysctl_init(void) diff --git a/net/psample/psample.c b/net/psample/psample.c index ddd211a151d0..a5d9b8446f77 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -221,7 +221,7 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, const struct ip_tunnel_key *tun_key = &tun_info->key; int tun_opts_len = tun_info->options_len; - if (tun_key->tun_flags & TUNNEL_KEY && + if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags) && nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, PSAMPLE_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; @@ -257,10 +257,10 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, return -EMSGSIZE; if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_CSUM) && + if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; if (tun_key->tp_src && @@ -269,15 +269,16 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, if (tun_key->tp_dst && nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_OAM) && + if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags) && nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; if (tun_opts_len) { - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT && + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; - else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT && + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_key->tun_flags) && nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, tun_opts_len, tun_opts)) return -EMSGSIZE; @@ -314,7 +315,7 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) int tun_opts_len = tun_info->options_len; int sum = nla_total_size(0); /* PSAMPLE_ATTR_TUNNEL */ - if (tun_key->tun_flags & TUNNEL_KEY) + if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags)) sum += nla_total_size_64bit(sizeof(u64)); if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) @@ -337,20 +338,21 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) if (tun_key->tos) sum += nla_total_size(sizeof(u8)); sum += nla_total_size(sizeof(u8)); /* TTL */ - if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags)) sum += nla_total_size(0); - if (tun_key->tun_flags & TUNNEL_CSUM) + if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_key->tp_src) sum += nla_total_size(sizeof(u16)); if (tun_key->tp_dst) sum += nla_total_size(sizeof(u16)); - if (tun_key->tun_flags & TUNNEL_OAM) + if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags)) sum += nla_total_size(0); if (tun_opts_len) { - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); - else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT) + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + tun_key->tun_flags)) sum += nla_total_size(tun_opts_len); } diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index 9ced13c0627a..69f53625a049 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -118,6 +118,51 @@ static const struct mhi_device_id qcom_mhi_qrtr_id_table[] = { }; MODULE_DEVICE_TABLE(mhi, qcom_mhi_qrtr_id_table); +static int __maybe_unused qcom_mhi_qrtr_pm_suspend_late(struct device *dev) +{ + struct mhi_device *mhi_dev = container_of(dev, struct mhi_device, dev); + enum mhi_state state; + + state = mhi_get_mhi_state(mhi_dev->mhi_cntrl); + /* + * If the device is in suspend state, then no need for the + * client driver to unprepare the channels. + */ + if (state == MHI_STATE_M3) + return 0; + + mhi_unprepare_from_transfer(mhi_dev); + + return 0; +} + +static int __maybe_unused qcom_mhi_qrtr_pm_resume_early(struct device *dev) +{ + struct mhi_device *mhi_dev = container_of(dev, struct mhi_device, dev); + enum mhi_state state; + int rc; + + state = mhi_get_mhi_state(mhi_dev->mhi_cntrl); + /* + * If the device is in suspend state, we won't unprepare channels + * in suspend callback, therefore no need to prepare channels when + * resume. + */ + if (state == MHI_STATE_M3) + return 0; + + rc = mhi_prepare_for_transfer_autoqueue(mhi_dev); + if (rc) + dev_err(dev, "failed to prepare for autoqueue transfer %d\n", rc); + + return rc; +} + +static const struct dev_pm_ops qcom_mhi_qrtr_pm_ops = { + SET_LATE_SYSTEM_SLEEP_PM_OPS(qcom_mhi_qrtr_pm_suspend_late, + qcom_mhi_qrtr_pm_resume_early) +}; + static struct mhi_driver qcom_mhi_qrtr_driver = { .probe = qcom_mhi_qrtr_probe, .remove = qcom_mhi_qrtr_remove, @@ -126,6 +171,7 @@ static struct mhi_driver qcom_mhi_qrtr_driver = { .id_table = qcom_mhi_qrtr_id_table, .driver = { .name = "qcom_mhi_qrtr", + .pm = &qcom_mhi_qrtr_pm_ops, }, }; diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index e4e41b3afce7..2af678e71e3c 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -103,7 +103,6 @@ static struct ctl_table rds_ib_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; void rds_ib_sysctl_exit(void) diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index e381bbcd9cc1..025f518a4349 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -89,7 +89,6 @@ static struct ctl_table rds_sysctl_rds_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; void rds_sysctl_exit(void) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 2dba7505b414..d8111ac83bb6 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -86,7 +86,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { .proc_handler = rds_tcp_skbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, - { } }; u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 4e32d659524e..84529886c2e6 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -156,14 +156,12 @@ err_destroy: return ret; } -static int rfkill_gpio_remove(struct platform_device *pdev) +static void rfkill_gpio_remove(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev); rfkill_unregister(rfkill->rfkill_dev); rfkill_destroy(rfkill->rfkill_dev); - - return 0; } #ifdef CONFIG_ACPI @@ -183,7 +181,7 @@ MODULE_DEVICE_TABLE(of, rfkill_of_match); static struct platform_driver rfkill_gpio_driver = { .probe = rfkill_gpio_probe, - .remove = rfkill_gpio_remove, + .remove_new = rfkill_gpio_remove, .driver = { .name = "rfkill_gpio", .acpi_match_table = ACPI_PTR(rfkill_acpi_match), diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c index d391d7758f52..d801315b7083 100644 --- a/net/rose/sysctl_net_rose.c +++ b/net/rose/sysctl_net_rose.c @@ -112,7 +112,6 @@ static struct ctl_table rose_table[] = { .extra1 = &min_window, .extra2 = &max_window }, - { } }; void __init rose_register_sysctl(void) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5222bc97d192..f4844683e120 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index c9bedd0e2d86..9bf9a1f6e4cb 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -127,7 +127,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, - { } }; int __init rxrpc_sysctl_init(void) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 1536f8b16f1b..af7c99845948 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -230,7 +230,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: - if (type && type != TUNNEL_GENEVE_OPT) { + if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } @@ -247,7 +247,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, dst_len -= opt_len; dst += opt_len; } - type = TUNNEL_GENEVE_OPT; + type = IP_TUNNEL_GENEVE_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: if (type) { @@ -259,7 +259,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_VXLAN_OPT; + type = IP_TUNNEL_VXLAN_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: if (type) { @@ -271,7 +271,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, if (opt_len < 0) return opt_len; opts_len += opt_len; - type = TUNNEL_ERSPAN_OPT; + type = IP_TUNNEL_ERSPAN_OPT_BIT; break; } } @@ -302,7 +302,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, switch (nla_type(nla_data(nla))) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_GENEVE_OPT; + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -310,7 +310,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #endif case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_VXLAN_OPT; + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -318,7 +318,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, #endif case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: #if IS_ENABLED(CONFIG_INET) - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else @@ -363,6 +363,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; + IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *metadata = NULL; struct tcf_chain *goto_ch = NULL; struct tc_tunnel_key *parm; @@ -371,7 +372,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, __be16 dst_port = 0; __be64 key_id = 0; int opts_len = 0; - __be16 flags = 0; u8 tos, ttl; int ret = 0; u32 index; @@ -412,16 +412,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); key_id = key32_to_tunnel_id(key32); - flags = TUNNEL_KEY; + __set_bit(IP_TUNNEL_KEY_BIT, flags); } - flags |= TUNNEL_CSUM; + __set_bit(IP_TUNNEL_CSUM_BIT, flags); if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) - flags &= ~TUNNEL_CSUM; + __clear_bit(IP_TUNNEL_CSUM_BIT, flags); if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG])) - flags |= TUNNEL_DONT_FRAGMENT; + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, flags); if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); @@ -663,15 +663,15 @@ static int tunnel_key_opts_dump(struct sk_buff *skb, if (!start) return -EMSGSIZE; - if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; - } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) { + } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_vxlan_opts_dump(skb, info); if (err) goto err_out; - } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) { + } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_erspan_opts_dump(skb, info); if (err) goto err_out; @@ -741,7 +741,7 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); - if (((key->tun_flags & TUNNEL_KEY) && + if ((test_bit(IP_TUNNEL_KEY_BIT, key->tun_flags) && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || @@ -749,8 +749,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM)) || - ((key->tun_flags & TUNNEL_DONT_FRAGMENT) && + !test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) || + (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) && nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ca5676b2668e..17d97bbe890f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -410,12 +410,48 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } +static void tcf_maintain_bypass(struct tcf_block *block) +{ + int filtercnt = atomic_read(&block->filtercnt); + int skipswcnt = atomic_read(&block->skipswcnt); + bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; + + if (bypass_wanted != block->bypass_wanted) { +#ifdef CONFIG_NET_CLS_ACT + if (bypass_wanted) + static_branch_inc(&tcf_bypass_check_needed_key); + else + static_branch_dec(&tcf_bypass_check_needed_key); +#endif + block->bypass_wanted = bypass_wanted; + } +} + +static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) +{ + lockdep_assert_not_held(&block->cb_lock); + + down_write(&block->cb_lock); + if (*counted != add) { + if (add) { + atomic_inc(&block->filtercnt); + *counted = true; + } else { + atomic_dec(&block->filtercnt); + *counted = false; + } + } + tcf_maintain_bypass(block); + up_write(&block->cb_lock); +} + static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -2367,6 +2403,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); + tcf_block_filter_cnt_update(block, &tp->counted, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; @@ -3483,6 +3520,8 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(*flags)) + atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } @@ -3491,6 +3530,8 @@ static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(*flags)) + atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e1314674b4a9..fd9a6f20b60b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,6 +28,7 @@ #include <net/vxlan.h> #include <net/erspan.h> #include <net/gtp.h> +#include <net/pfcp.h> #include <net/tc_wrapper.h> #include <net/dst.h> @@ -741,6 +742,7 @@ enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_ENC_OPTS_GTP] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_PFCP] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -771,6 +773,12 @@ gtp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1] = { }; static const struct nla_policy +pfcp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID] = { .type = NLA_U64 }, +}; + +static const struct nla_policy mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, @@ -1419,6 +1427,44 @@ static int fl_set_gtp_opt(const struct nlattr *nla, struct fl_flow_key *key, return sizeof(*sinfo); } +static int fl_set_pfcp_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1]; + struct pfcp_metadata *md; + int err; + + md = (struct pfcp_metadata *)&key->enc_opts.data[key->enc_opts.len]; + memset(md, 0xff, sizeof(*md)); + + if (!depth) + return sizeof(*md); + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_PFCP) { + NL_SET_ERR_MSG_MOD(extack, "Non-pfcp option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX, nla, + pfcp_opt_policy, extack); + if (err < 0) + return err; + + if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing tunnel key pfcp option type"); + return -EINVAL; + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) + md->type = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]); + + if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]) + md->seid = nla_get_be64(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]); + + return sizeof(*md); +} + static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1454,12 +1500,13 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, switch (nla_type(nla_opt_key)) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: if (key->enc_opts.dst_opt_type && - key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) { + key->enc_opts.dst_opt_type != + IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT; option_len = fl_set_geneve_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1470,7 +1517,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT; option_len = fl_set_geneve_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1489,7 +1536,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT; option_len = fl_set_vxlan_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1500,7 +1547,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT; option_len = fl_set_vxlan_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1519,7 +1566,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT; option_len = fl_set_erspan_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1530,7 +1577,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT; option_len = fl_set_erspan_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1550,7 +1597,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } option_len = 0; - key->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + key->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT; option_len = fl_set_gtp_opt(nla_opt_key, key, key_depth, option_len, extack); @@ -1561,7 +1608,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, /* At the same time we need to parse through the mask * in order to verify exact and mask attribute lengths. */ - mask->enc_opts.dst_opt_type = TUNNEL_GTP_OPT; + mask->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT; option_len = fl_set_gtp_opt(nla_opt_msk, mask, msk_depth, option_len, extack); @@ -1575,6 +1622,36 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, return -EINVAL; } break; + case TCA_FLOWER_KEY_ENC_OPTS_PFCP: + if (key->enc_opts.dst_opt_type) { + NL_SET_ERR_MSG_MOD(extack, "Duplicate type for pfcp options"); + return -EINVAL; + } + option_len = 0; + key->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT; + option_len = fl_set_pfcp_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. + */ + mask->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT; + option_len = fl_set_pfcp_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG_MOD(extack, "Key and mask miss aligned"); + return -EINVAL; + } + break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; @@ -3117,6 +3194,32 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_pfcp_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct pfcp_metadata *md; + struct nlattr *nest; + + nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_PFCP); + if (!nest) + goto nla_put_failure; + + md = (struct pfcp_metadata *)&enc_opts->data[0]; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, md->type)) + goto nla_put_failure; + + if (nla_put_be64(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID, + md->seid, 0)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fl_dump_key_ct(struct sk_buff *skb, struct flow_dissector_key_ct *key, struct flow_dissector_key_ct *mask) @@ -3202,26 +3305,31 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, goto nla_put_failure; switch (enc_opts->dst_opt_type) { - case TUNNEL_GENEVE_OPT: + case IP_TUNNEL_GENEVE_OPT_BIT: err = fl_dump_key_geneve_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_VXLAN_OPT: + case IP_TUNNEL_VXLAN_OPT_BIT: err = fl_dump_key_vxlan_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_ERSPAN_OPT: + case IP_TUNNEL_ERSPAN_OPT_BIT: err = fl_dump_key_erspan_opt(skb, enc_opts); if (err) goto nla_put_failure; break; - case TUNNEL_GTP_OPT: + case IP_TUNNEL_GTP_OPT_BIT: err = fl_dump_key_gtp_opt(skb, enc_opts); if (err) goto nla_put_failure; break; + case IP_TUNNEL_PFCP_OPT_BIT: + err = fl_dump_key_pfcp_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; default: goto nla_put_failure; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 60239378d43f..74afc210527d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1334,7 +1334,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, * before again attaching a qdisc. */ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { - dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } @@ -1389,6 +1389,7 @@ err_out4: ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: + lockdep_unregister_key(&sch->root_lock_key); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index edee926ccde8..9602dafe32e6 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1512,7 +1512,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) if (!q->overflow_timeout) { int i; /* Build fresh max-heap */ - for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) cake_heapify(q, i); } q->overflow_timeout = 65535; @@ -2572,6 +2572,8 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; + u16 rate_flags; + u8 flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, @@ -2579,10 +2581,11 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) - q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; - q->flow_mode |= CAKE_FLOW_NAT_FLAG * + flow_mode &= ~CAKE_FLOW_NAT_FLAG; + flow_mode |= CAKE_FLOW_NAT_FLAG * !!nla_get_u32(tb[TCA_CAKE_NAT]); #else NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], @@ -2592,29 +2595,34 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_BASE_RATE64]) - q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + WRITE_ONCE(q->rate_bps, + nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); if (tb[TCA_CAKE_DIFFSERV_MODE]) - q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + WRITE_ONCE(q->tin_mode, + nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); + rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) - q->rate_flags |= CAKE_FLAG_WASH; + rate_flags |= CAKE_FLAG_WASH; else - q->rate_flags &= ~CAKE_FLAG_WASH; + rate_flags &= ~CAKE_FLAG_WASH; } if (tb[TCA_CAKE_FLOW_MODE]) - q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); if (tb[TCA_CAKE_ATM]) - q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + WRITE_ONCE(q->atm_mode, + nla_get_u32(tb[TCA_CAKE_ATM])); if (tb[TCA_CAKE_OVERHEAD]) { - q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); - q->rate_flags |= CAKE_FLAG_OVERHEAD; + WRITE_ONCE(q->rate_overhead, + nla_get_s32(tb[TCA_CAKE_OVERHEAD])); + rate_flags |= CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2623,7 +2631,7 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_RAW]) { - q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + rate_flags &= ~CAKE_FLAG_OVERHEAD; q->max_netlen = 0; q->max_adjlen = 0; @@ -2632,54 +2640,58 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_CAKE_MPU]) - q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + WRITE_ONCE(q->rate_mpu, + nla_get_u32(tb[TCA_CAKE_MPU])); if (tb[TCA_CAKE_RTT]) { - q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); - if (!q->interval) - q->interval = 1; + WRITE_ONCE(q->interval, max(interval, 1U)); } if (tb[TCA_CAKE_TARGET]) { - q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); + u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); - if (!q->target) - q->target = 1; + WRITE_ONCE(q->target, max(target, 1U)); } if (tb[TCA_CAKE_AUTORATE]) { if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) - q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; } if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) - q->rate_flags |= CAKE_FLAG_INGRESS; + rate_flags |= CAKE_FLAG_INGRESS; else - q->rate_flags &= ~CAKE_FLAG_INGRESS; + rate_flags &= ~CAKE_FLAG_INGRESS; } if (tb[TCA_CAKE_ACK_FILTER]) - q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + WRITE_ONCE(q->ack_filter, + nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); if (tb[TCA_CAKE_MEMORY]) - q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + WRITE_ONCE(q->buffer_config_limit, + nla_get_u32(tb[TCA_CAKE_MEMORY])); if (tb[TCA_CAKE_SPLIT_GSO]) { if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) - q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + rate_flags |= CAKE_FLAG_SPLIT_GSO; else - q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } if (tb[TCA_CAKE_FWMARK]) { - q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); - q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; + WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); + WRITE_ONCE(q->fwmark_shft, + q->fwmark_mask ? __ffs(q->fwmark_mask) : 0); } + WRITE_ONCE(q->rate_flags, rate_flags); + WRITE_ONCE(q->flow_mode, flow_mode); if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2774,68 +2786,72 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) { struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; + u16 rate_flags; + u8 flow_mode; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; - if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, - TCA_CAKE_PAD)) + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, + READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, - q->flow_mode & CAKE_FLOW_MASK)) + flow_mode = READ_ONCE(q->flow_mode); + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + if (nla_put_u32(skb, TCA_CAKE_MEMORY, + READ_ONCE(q->buffer_config_limit))) goto nla_put_failure; + rate_flags = READ_ONCE(q->rate_flags); if (nla_put_u32(skb, TCA_CAKE_AUTORATE, - !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_INGRESS, - !!(q->rate_flags & CAKE_FLAG_INGRESS))) + !!(rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_NAT, - !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + !!(flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_WASH, - !!(q->rate_flags & CAKE_FLAG_WASH))) + !!(rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) goto nla_put_failure; - if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (!(rate_flags & CAKE_FLAG_OVERHEAD)) if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) goto nla_put_failure; if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, - !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; - if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) + if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 69001eff0315..939425da1895 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -389,11 +389,11 @@ static int cbs_change(struct Qdisc *sch, struct nlattr *opt, } /* Everything went OK, save the parameters used. */ - q->hicredit = qopt->hicredit; - q->locredit = qopt->locredit; - q->idleslope = qopt->idleslope * BYTES_PER_KBIT; - q->sendslope = qopt->sendslope * BYTES_PER_KBIT; - q->offload = qopt->offload; + WRITE_ONCE(q->hicredit, qopt->hicredit); + WRITE_ONCE(q->locredit, qopt->locredit); + WRITE_ONCE(q->idleslope, qopt->idleslope * BYTES_PER_KBIT); + WRITE_ONCE(q->sendslope, qopt->sendslope * BYTES_PER_KBIT); + WRITE_ONCE(q->offload, qopt->offload); return 0; } @@ -459,11 +459,11 @@ static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) if (!nest) goto nla_put_failure; - opt.hicredit = q->hicredit; - opt.locredit = q->locredit; - opt.sendslope = div64_s64(q->sendslope, BYTES_PER_KBIT); - opt.idleslope = div64_s64(q->idleslope, BYTES_PER_KBIT); - opt.offload = q->offload; + opt.hicredit = READ_ONCE(q->hicredit); + opt.locredit = READ_ONCE(q->locredit); + opt.sendslope = div64_s64(READ_ONCE(q->sendslope), BYTES_PER_KBIT); + opt.idleslope = div64_s64(READ_ONCE(q->idleslope), BYTES_PER_KBIT); + opt.offload = READ_ONCE(q->offload); if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ea108030c6b4..91072010923d 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -405,8 +405,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, } else sch_tree_lock(sch); - q->flags = ctl->flags; - q->limit = ctl->limit; + WRITE_ONCE(q->flags, ctl->flags); + WRITE_ONCE(q->limit, ctl->limit); red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, @@ -431,15 +431,16 @@ static int choke_init(struct Qdisc *sch, struct nlattr *opt, static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) { struct choke_sched_data *q = qdisc_priv(sch); + u8 Wlog = READ_ONCE(q->parms.Wlog); struct nlattr *opts = NULL; struct tc_red_qopt opt = { - .limit = q->limit, - .flags = q->flags, - .qth_min = q->parms.qth_min >> q->parms.Wlog, - .qth_max = q->parms.qth_max >> q->parms.Wlog, - .Wlog = q->parms.Wlog, - .Plog = q->parms.Plog, - .Scell_log = q->parms.Scell_log, + .limit = READ_ONCE(q->limit), + .flags = READ_ONCE(q->flags), + .qth_min = READ_ONCE(q->parms.qth_min) >> Wlog, + .qth_max = READ_ONCE(q->parms.qth_max) >> Wlog, + .Wlog = Wlog, + .Plog = READ_ONCE(q->parms.Plog), + .Scell_log = READ_ONCE(q->parms.Scell_log), }; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -447,7 +448,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || - nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index ecb3f164bb25..3e8d4fe4d91e 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -118,26 +118,31 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CODEL_TARGET]) { u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); - q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.target, + ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_CODEL_CE_THRESHOLD]); - q->params.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.ce_threshold, + (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_INTERVAL]) { u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); - q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->params.interval, + ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_CODEL_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_CODEL_LIMIT])); if (tb[TCA_CODEL_ECN]) - q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + WRITE_ONCE(q->params.ecn, + !!nla_get_u32(tb[TCA_CODEL_ECN])); qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { @@ -183,6 +188,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct codel_sched_data *q = qdisc_priv(sch); + codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -190,17 +196,18 @@ static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_CODEL_TARGET, - codel_time_to_us(q->params.target)) || + codel_time_to_us(READ_ONCE(q->params.target))) || nla_put_u32(skb, TCA_CODEL_LIMIT, - sch->limit) || + READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_CODEL_INTERVAL, - codel_time_to_us(q->params.interval)) || + codel_time_to_us(READ_ONCE(q->params.interval))) || nla_put_u32(skb, TCA_CODEL_ECN, - q->params.ecn)) + READ_ONCE(q->params.ecn))) goto nla_put_failure; - if (q->params.ce_threshold != CODEL_DISABLED_THRESHOLD && + ce_threshold = READ_ONCE(q->params.ce_threshold); + if (ce_threshold != CODEL_DISABLED_THRESHOLD && nla_put_u32(skb, TCA_CODEL_CE_THRESHOLD, - codel_time_to_us(q->params.ce_threshold))) + codel_time_to_us(ce_threshold))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 2e4bef713b6a..c74d778c32a1 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -467,15 +467,15 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) if (!nest) goto nla_put_failure; - opt.delta = q->delta; - opt.clockid = q->clockid; - if (q->offload) + opt.delta = READ_ONCE(q->delta); + opt.clockid = READ_ONCE(q->clockid); + if (READ_ONCE(q->offload)) opt.flags |= TC_ETF_OFFLOAD_ON; - if (q->deadline_mode) + if (READ_ONCE(q->deadline_mode)) opt.flags |= TC_ETF_DEADLINE_MODE_ON; - if (q->skip_sock_check) + if (READ_ONCE(q->skip_sock_check)) opt.flags |= TC_ETF_SKIP_SOCK_CHECK; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 835b4460b448..f80bc05d4c5a 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -646,7 +646,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); - q->nbands = nbands; + WRITE_ONCE(q->nbands, nbands); for (i = nstrict; i < q->nstrict; i++) { if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); @@ -658,11 +658,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, list_del(&q->classes[i].alist); qdisc_tree_flush_backlog(q->classes[i].qdisc); } - q->nstrict = nstrict; + WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); for (i = 0; i < q->nbands; i++) - q->classes[i].quantum = quanta[i]; + WRITE_ONCE(q->classes[i].quantum, quanta[i]); for (i = oldbands; i < q->nbands; i++) { q->classes[i].qdisc = queues[i]; @@ -676,7 +676,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); q->classes[i].qdisc = NULL; - q->classes[i].quantum = 0; + WRITE_ONCE(q->classes[i].quantum, 0); q->classes[i].deficit = 0; gnet_stats_basic_sync_init(&q->classes[i].bstats); memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); @@ -733,6 +733,7 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) struct ets_sched *q = qdisc_priv(sch); struct nlattr *opts; struct nlattr *nest; + u8 nbands, nstrict; int band; int prio; int err; @@ -745,21 +746,22 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) if (!opts) goto nla_err; - if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) + nbands = READ_ONCE(q->nbands); + if (nla_put_u8(skb, TCA_ETS_NBANDS, nbands)) goto nla_err; - if (q->nstrict && - nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) + nstrict = READ_ONCE(q->nstrict); + if (nstrict && nla_put_u8(skb, TCA_ETS_NSTRICT, nstrict)) goto nla_err; - if (q->nbands > q->nstrict) { + if (nbands > nstrict) { nest = nla_nest_start(skb, TCA_ETS_QUANTA); if (!nest) goto nla_err; - for (band = q->nstrict; band < q->nbands; band++) { + for (band = nstrict; band < nbands; band++) { if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, - q->classes[band].quantum)) + READ_ONCE(q->classes[band].quantum))) goto nla_err; } @@ -771,7 +773,8 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_err; for (prio = 0; prio <= TC_PRIO_MAX; prio++) { - if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) + if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, + READ_ONCE(q->prio2band[prio]))) goto nla_err; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 450f5c67ac49..b50b2c2cc09b 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,7 +19,8 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit)) + if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= + READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -28,7 +29,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(sch->q.qlen < sch->limit)) + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -39,7 +40,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; - if (likely(sch->q.qlen < sch->limit)) + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; @@ -105,14 +106,14 @@ static int __fifo_init(struct Qdisc *sch, struct nlattr *opt, if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); - sch->limit = limit; + WRITE_ONCE(sch->limit, limit); } else { struct tc_fifo_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; - sch->limit = ctl->limit; + WRITE_ONCE(sch->limit, ctl->limit); } if (is_bfifo) @@ -154,7 +155,7 @@ static void fifo_destroy(struct Qdisc *sch) static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct tc_fifo_qopt opt = { .limit = sch->limit }; + struct tc_fifo_qopt opt = { .limit = READ_ONCE(sch->limit) }; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index cdf23ff16f40..238974725679 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -106,6 +106,8 @@ struct fq_perband_flows { int quantum; /* based on band nr : 576KB, 192KB, 64KB */ }; +#define FQ_PRIO2BAND_CRUMB_SIZE ((TC_PRIO_MAX + 1) >> 2) + struct fq_sched_data { /* Read mostly cache line */ @@ -122,7 +124,7 @@ struct fq_sched_data { u8 rate_enable; u8 fq_trees_log; u8 horizon_drop; - u8 prio2band[(TC_PRIO_MAX + 1) >> 2]; + u8 prio2band[FQ_PRIO2BAND_CRUMB_SIZE]; u32 timer_slack; /* hrtimer slack in ns */ /* Read/Write fields. */ @@ -159,7 +161,7 @@ struct fq_sched_data { /* return the i-th 2-bit value ("crumb") */ static u8 fq_prio2band(const u8 *prio2band, unsigned int prio) { - return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3; + return (READ_ONCE(prio2band[prio / 4]) >> (2 * (prio & 0x3))) & 0x3; } /* @@ -888,7 +890,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); q->fq_root = array; - q->fq_trees_log = log; + WRITE_ONCE(q->fq_trees_log, log); sch_tree_unlock(sch); @@ -927,11 +929,15 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { static void fq_prio2band_compress_crumb(const u8 *in, u8 *out) { const int num_elems = TC_PRIO_MAX + 1; + u8 tmp[FQ_PRIO2BAND_CRUMB_SIZE]; int i; - memset(out, 0, num_elems / 4); + memset(tmp, 0, sizeof(tmp)); for (i = 0; i < num_elems; i++) - out[i / 4] |= in[i] << (2 * (i & 0x3)); + tmp[i / 4] |= in[i] << (2 * (i & 0x3)); + + for (i = 0; i < FQ_PRIO2BAND_CRUMB_SIZE; i++) + WRITE_ONCE(out[i], tmp[i]); } static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out) @@ -958,7 +964,7 @@ static int fq_load_weights(struct fq_sched_data *q, } } for (i = 0; i < FQ_BANDS; i++) - q->band_flows[i].quantum = weights[i]; + WRITE_ONCE(q->band_flows[i].quantum, weights[i]); return 0; } @@ -1011,16 +1017,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = -EINVAL; } if (tb[TCA_FQ_PLIMIT]) - sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_FQ_PLIMIT])); if (tb[TCA_FQ_FLOW_PLIMIT]) - q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + WRITE_ONCE(q->flow_plimit, + nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT])); if (tb[TCA_FQ_QUANTUM]) { u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); if (quantum > 0 && quantum <= (1 << 20)) { - q->quantum = quantum; + WRITE_ONCE(q->quantum, quantum); } else { NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); err = -EINVAL; @@ -1028,7 +1036,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_FQ_INITIAL_QUANTUM]) - q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + WRITE_ONCE(q->initial_quantum, + nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM])); if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", @@ -1037,17 +1046,19 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_FLOW_MAX_RATE]) { u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); - q->flow_max_rate = (rate == ~0U) ? ~0UL : rate; + WRITE_ONCE(q->flow_max_rate, + (rate == ~0U) ? ~0UL : rate); } if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) - q->low_rate_threshold = - nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + WRITE_ONCE(q->low_rate_threshold, + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD])); if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); if (enable <= 1) - q->rate_enable = enable; + WRITE_ONCE(q->rate_enable, + enable); else err = -EINVAL; } @@ -1055,7 +1066,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; - q->flow_refill_delay = usecs_to_jiffies(usecs_delay); + WRITE_ONCE(q->flow_refill_delay, + usecs_to_jiffies(usecs_delay)); } if (!err && tb[TCA_FQ_PRIOMAP]) @@ -1065,21 +1077,26 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack); if (tb[TCA_FQ_ORPHAN_MASK]) - q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); + WRITE_ONCE(q->orphan_mask, + nla_get_u32(tb[TCA_FQ_ORPHAN_MASK])); if (tb[TCA_FQ_CE_THRESHOLD]) - q->ce_threshold = (u64)NSEC_PER_USEC * - nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + WRITE_ONCE(q->ce_threshold, + (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_CE_THRESHOLD])); if (tb[TCA_FQ_TIMER_SLACK]) - q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + WRITE_ONCE(q->timer_slack, + nla_get_u32(tb[TCA_FQ_TIMER_SLACK])); if (tb[TCA_FQ_HORIZON]) - q->horizon = (u64)NSEC_PER_USEC * - nla_get_u32(tb[TCA_FQ_HORIZON]); + WRITE_ONCE(q->horizon, + (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON])); if (tb[TCA_FQ_HORIZON_DROP]) - q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + WRITE_ONCE(q->horizon_drop, + nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); if (!err) { @@ -1160,13 +1177,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); - u64 ce_threshold = q->ce_threshold; struct tc_prio_qopt prio = { .bands = FQ_BANDS, }; - u64 horizon = q->horizon; struct nlattr *opts; + u64 ce_threshold; s32 weights[3]; + u64 horizon; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) @@ -1174,35 +1191,48 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ + ce_threshold = READ_ONCE(q->ce_threshold); do_div(ce_threshold, NSEC_PER_USEC); + + horizon = READ_ONCE(q->horizon); do_div(horizon, NSEC_PER_USEC); - if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || - nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || - nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || - nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + if (nla_put_u32(skb, TCA_FQ_PLIMIT, + READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, + READ_ONCE(q->flow_plimit)) || + nla_put_u32(skb, TCA_FQ_QUANTUM, + READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, + READ_ONCE(q->initial_quantum)) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, + READ_ONCE(q->rate_enable)) || nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, - min_t(unsigned long, q->flow_max_rate, ~0U)) || + min_t(unsigned long, + READ_ONCE(q->flow_max_rate), ~0U)) || nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, - jiffies_to_usecs(q->flow_refill_delay)) || - nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + jiffies_to_usecs(READ_ONCE(q->flow_refill_delay))) || + nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, + READ_ONCE(q->orphan_mask)) || nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, - q->low_rate_threshold) || + READ_ONCE(q->low_rate_threshold)) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, + READ_ONCE(q->fq_trees_log)) || + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, + READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || - nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, + READ_ONCE(q->horizon_drop))) goto nla_put_failure; fq_prio2band_decompress_crumb(q->prio2band, prio.priomap); if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio)) goto nla_put_failure; - weights[0] = q->band_flows[0].quantum; - weights[1] = q->band_flows[1].quantum; - weights[2] = q->band_flows[2].quantum; + weights[0] = READ_ONCE(q->band_flows[0].quantum); + weights[1] = READ_ONCE(q->band_flows[1].quantum); + weights[2] = READ_ONCE(q->band_flows[2].quantum); if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights)) goto nla_put_failure; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 79f9d6de6c85..4f908c11ba95 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -396,40 +396,49 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_TARGET]) { u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); - q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.target, + (target * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); - q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.ce_threshold, + (val * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]) - q->cparams.ce_threshold_selector = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]); + WRITE_ONCE(q->cparams.ce_threshold_selector, + nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR])); if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]) - q->cparams.ce_threshold_mask = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]); + WRITE_ONCE(q->cparams.ce_threshold_mask, + nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK])); if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); - q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + WRITE_ONCE(q->cparams.interval, + (interval * NSEC_PER_USEC) >> CODEL_SHIFT); } if (tb[TCA_FQ_CODEL_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + WRITE_ONCE(sch->limit, + nla_get_u32(tb[TCA_FQ_CODEL_LIMIT])); if (tb[TCA_FQ_CODEL_ECN]) - q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + WRITE_ONCE(q->cparams.ecn, + !!nla_get_u32(tb[TCA_FQ_CODEL_ECN])); if (quantum) - q->quantum = quantum; + WRITE_ONCE(q->quantum, quantum); if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) - q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); + WRITE_ONCE(q->drop_batch_size, + max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]))); if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) - q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); + WRITE_ONCE(q->memory_limit, + min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]))); while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { @@ -522,6 +531,7 @@ init_failure: static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_codel_sched_data *q = qdisc_priv(sch); + codel_time_t ce_threshold; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -529,30 +539,33 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, - codel_time_to_us(q->cparams.target)) || + codel_time_to_us(READ_ONCE(q->cparams.target))) || nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, - sch->limit) || + READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, - codel_time_to_us(q->cparams.interval)) || + codel_time_to_us(READ_ONCE(q->cparams.interval))) || nla_put_u32(skb, TCA_FQ_CODEL_ECN, - q->cparams.ecn) || + READ_ONCE(q->cparams.ecn)) || nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, - q->quantum) || + READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE, - q->drop_batch_size) || + READ_ONCE(q->drop_batch_size)) || nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT, - q->memory_limit) || + READ_ONCE(q->memory_limit)) || nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, - q->flows_cnt)) + READ_ONCE(q->flows_cnt))) goto nla_put_failure; - if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD) { + ce_threshold = READ_ONCE(q->cparams.ce_threshold); + if (ce_threshold != CODEL_DISABLED_THRESHOLD) { if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, - codel_time_to_us(q->cparams.ce_threshold))) + codel_time_to_us(ce_threshold))) goto nla_put_failure; - if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, q->cparams.ce_threshold_selector)) + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, + READ_ONCE(q->cparams.ce_threshold_selector))) goto nla_put_failure; - if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, q->cparams.ce_threshold_mask)) + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, + READ_ONCE(q->cparams.ce_threshold_mask))) goto nla_put_failure; } diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 358cf304f4c9..c38f33ff80bd 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -299,8 +299,8 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]); - q->p_params.limit = limit; - sch->limit = limit; + WRITE_ONCE(q->p_params.limit, limit); + WRITE_ONCE(sch->limit, limit); } if (tb[TCA_FQ_PIE_FLOWS]) { if (q->flows) { @@ -322,39 +322,45 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]); /* convert to pschedtime */ - q->p_params.target = - PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + WRITE_ONCE(q->p_params.target, + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_FQ_PIE_TUPDATE]) - q->p_params.tupdate = - usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE])); + WRITE_ONCE(q->p_params.tupdate, + usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]))); if (tb[TCA_FQ_PIE_ALPHA]) - q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]); + WRITE_ONCE(q->p_params.alpha, + nla_get_u32(tb[TCA_FQ_PIE_ALPHA])); if (tb[TCA_FQ_PIE_BETA]) - q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]); + WRITE_ONCE(q->p_params.beta, + nla_get_u32(tb[TCA_FQ_PIE_BETA])); if (tb[TCA_FQ_PIE_QUANTUM]) - q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]); + WRITE_ONCE(q->quantum, nla_get_u32(tb[TCA_FQ_PIE_QUANTUM])); if (tb[TCA_FQ_PIE_MEMORY_LIMIT]) - q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]); + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT])); if (tb[TCA_FQ_PIE_ECN_PROB]) - q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]); + WRITE_ONCE(q->ecn_prob, + nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB])); if (tb[TCA_FQ_PIE_ECN]) - q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]); + WRITE_ONCE(q->p_params.ecn, + nla_get_u32(tb[TCA_FQ_PIE_ECN])); if (tb[TCA_FQ_PIE_BYTEMODE]) - q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]); + WRITE_ONCE(q->p_params.bytemode, + nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE])); if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]) - q->p_params.dq_rate_estimator = - nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]); + WRITE_ONCE(q->p_params.dq_rate_estimator, + nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { @@ -471,22 +477,23 @@ static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb) return -EMSGSIZE; /* convert target from pschedtime to us */ - if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) || + if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_FQ_PIE_FLOWS, READ_ONCE(q->flows_cnt)) || nla_put_u32(skb, TCA_FQ_PIE_TARGET, - ((u32)PSCHED_TICKS2NS(q->p_params.target)) / + ((u32)PSCHED_TICKS2NS(READ_ONCE(q->p_params.target))) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_FQ_PIE_TUPDATE, - jiffies_to_usecs(q->p_params.tupdate)) || - nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) || - nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) || - nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) || - nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) || - nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) || - nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) || + jiffies_to_usecs(READ_ONCE(q->p_params.tupdate))) || + nla_put_u32(skb, TCA_FQ_PIE_ALPHA, READ_ONCE(q->p_params.alpha)) || + nla_put_u32(skb, TCA_FQ_PIE_BETA, READ_ONCE(q->p_params.beta)) || + nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, READ_ONCE(q->ecn_prob)) || + nla_put_u32(skb, TCA_FQ_PIE_ECN, READ_ONCE(q->p_params.ecn)) || + nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, READ_ONCE(q->p_params.bytemode)) || nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR, - q->p_params.dq_rate_estimator)) + READ_ONCE(q->p_params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 4a2c763e2d11..2a637a17061b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -506,19 +506,22 @@ static void dev_watchdog(struct timer_list *t) unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; + unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); trans_start = READ_ONCE(txq->trans_start); - if (netif_xmit_stopped(txq) && - time_after(jiffies, (trans_start + - dev->watchdog_timeo))) { + if (!netif_xmit_stopped(txq)) + continue; + if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } + if (time_after(oldest_start, trans_start)) + oldest_start = trans_start; } if (unlikely(timedout_ms)) { @@ -531,7 +534,7 @@ static void dev_watchdog(struct timer_list *t) netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + + round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } @@ -945,7 +948,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); + lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); + lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = @@ -980,6 +985,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, return sch; errout1: + lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); @@ -1068,6 +1074,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) if (ops->destroy) ops->destroy(qdisc); + lockdep_unregister_key(&qdisc->root_lock_key); module_put(ops->owner); netdev_put(dev, &qdisc->dev_tracker); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 4e626df742d7..c287bf8423b4 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1174,7 +1174,8 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) } /* classification failed, try default class */ - cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), + READ_ONCE(q->defcls)), sch); if (cl == NULL || cl->level > 0) return NULL; @@ -1443,9 +1444,7 @@ hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; qopt = nla_data(opt); - sch_tree_lock(sch); - q->defcls = qopt->defcls; - sch_tree_unlock(sch); + WRITE_ONCE(q->defcls, qopt->defcls); return 0; } @@ -1525,7 +1524,7 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; - qopt.defcls = q->defcls; + qopt.defcls = READ_ONCE(q->defcls); if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 3f906df1435b..44d9efe1a96a 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -534,27 +534,31 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) - sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); + WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); - q->quantum = new_quantum; - q->hhf_non_hh_weight = new_hhf_non_hh_weight; + WRITE_ONCE(q->quantum, new_quantum); + WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) - q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); + WRITE_ONCE(q->hh_flows_limit, + nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); - q->hhf_reset_timeout = usecs_to_jiffies(us); + WRITE_ONCE(q->hhf_reset_timeout, + usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) - q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); + WRITE_ONCE(q->hhf_admit_bytes, + nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); - q->hhf_evict_timeout = usecs_to_jiffies(us); + WRITE_ONCE(q->hhf_evict_timeout, + usecs_to_jiffies(us)); } qlen = sch->q.qlen; @@ -657,15 +661,18 @@ static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || - nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || - nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || + if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || + nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, + READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, - jiffies_to_usecs(q->hhf_reset_timeout)) || - nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || + jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || + nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, + READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, - jiffies_to_usecs(q->hhf_evict_timeout)) || - nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) + jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || + nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, + READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 93e6fb56f3b5..ff3de37874e4 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1039,13 +1039,6 @@ static void htb_work_func(struct work_struct *work) rcu_read_unlock(); } -static void htb_set_lockdep_class_child(struct Qdisc *q) -{ - static struct lock_class_key child_key; - - lockdep_set_class(qdisc_lock(q), &child_key); -} - static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt) { return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt); @@ -1132,7 +1125,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, return -ENOMEM; } - htb_set_lockdep_class_child(qdisc); q->direct_qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } @@ -1468,7 +1460,6 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, } if (q->offload) { - htb_set_lockdep_class_child(new); /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ qdisc_refcount_inc(new); old_q = htb_graft_helper(dev_queue, new); @@ -1733,11 +1724,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); - if (q->offload) { - if (new_q) - htb_set_lockdep_class_child(new_q); + if (q->offload) htb_parent_to_leaf_offload(sch, dev_queue, new_q); - } } sch_tree_lock(sch); @@ -1947,13 +1935,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, classid, NULL); if (q->offload) { - if (new_q) { - htb_set_lockdep_class_child(new_q); - /* One ref for cl->leaf.q, the other for - * dev_queue->qdisc. - */ + /* One ref for cl->leaf.q, the other for dev_queue->qdisc. */ + if (new_q) qdisc_refcount_inc(new_q); - } old_q = htb_graft_helper(dev_queue, new_q); /* No qdisc_put needed. */ WARN_ON(!(old_q->flags & TCQ_F_BUILTIN)); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 225353fbb3f1..51d4013b6121 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -215,10 +215,8 @@ static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt, for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) fp[tc] = priv->fp[tc]; - nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) { - if (nla_type(n) != TCA_MQPRIO_TC_ENTRY) - continue; - + nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt, + nlattr_opt_len, rem) { err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack); if (err) goto out; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 1764059b0635..b3dcb845b327 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -156,36 +156,38 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); /* convert to pschedtime */ - q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); + WRITE_ONCE(q->params.target, + PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC)); } /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) - q->params.tupdate = - usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); + WRITE_ONCE(q->params.tupdate, + usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]))); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); - q->params.limit = limit; - sch->limit = limit; + WRITE_ONCE(q->params.limit, limit); + WRITE_ONCE(sch->limit, limit); } if (tb[TCA_PIE_ALPHA]) - q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); + WRITE_ONCE(q->params.alpha, nla_get_u32(tb[TCA_PIE_ALPHA])); if (tb[TCA_PIE_BETA]) - q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); + WRITE_ONCE(q->params.beta, nla_get_u32(tb[TCA_PIE_BETA])); if (tb[TCA_PIE_ECN]) - q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); + WRITE_ONCE(q->params.ecn, nla_get_u32(tb[TCA_PIE_ECN])); if (tb[TCA_PIE_BYTEMODE]) - q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); + WRITE_ONCE(q->params.bytemode, + nla_get_u32(tb[TCA_PIE_BYTEMODE])); if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) - q->params.dq_rate_estimator = - nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); + WRITE_ONCE(q->params.dq_rate_estimator, + nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; @@ -469,17 +471,18 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, - ((u32)PSCHED_TICKS2NS(q->params.target)) / + ((u32)PSCHED_TICKS2NS(READ_ONCE(q->params.target))) / NSEC_PER_USEC) || - nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || + nla_put_u32(skb, TCA_PIE_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_PIE_TUPDATE, - jiffies_to_usecs(q->params.tupdate)) || - nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || - nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || + jiffies_to_usecs(READ_ONCE(q->params.tupdate))) || + nla_put_u32(skb, TCA_PIE_ALPHA, READ_ONCE(q->params.alpha)) || + nla_put_u32(skb, TCA_PIE_BETA, READ_ONCE(q->params.beta)) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || - nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || + nla_put_u32(skb, TCA_PIE_BYTEMODE, + READ_ONCE(q->params.bytemode)) || nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, - q->params.dq_rate_estimator)) + READ_ONCE(q->params.dq_rate_estimator))) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index e66f4afb920d..3b9245a3c767 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -608,6 +608,7 @@ static void sfq_perturbation(struct timer_list *t) struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; + int period; get_random_bytes(&nkey, sizeof(nkey)); rcu_read_lock(); @@ -618,8 +619,12 @@ static void sfq_perturbation(struct timer_list *t) sfq_rehash(sch); spin_unlock(root_lock); - if (q->perturb_period) - mod_timer(&q->perturb_timer, jiffies + q->perturb_period); + /* q->perturb_period can change under us from + * sfq_change() and sfq_destroy(). + */ + period = READ_ONCE(q->perturb_period); + if (period) + mod_timer(&q->perturb_timer, jiffies + period); rcu_read_unlock(); } @@ -662,7 +667,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) q->quantum = ctl->quantum; q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); } - q->perturb_period = ctl->perturb_period * HZ; + WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); if (ctl->flows) q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { @@ -724,7 +729,7 @@ static void sfq_destroy(struct Qdisc *sch) struct sfq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - q->perturb_period = 0; + WRITE_ONCE(q->perturb_period, 0); del_timer_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index b4dd626c309c..20ff7386b74b 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -79,7 +79,9 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, prio = min(skb->priority, max_priority); qdisc = &q->qdiscs[prio]; - if (sch->q.qlen < sch->limit) { + + /* sch->limit can change under us from skbprio_change() */ + if (sch->q.qlen < READ_ONCE(sch->limit)) { __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); @@ -172,7 +174,7 @@ static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, if (opt->nla_len != nla_attr_size(sizeof(*ctl))) return -EINVAL; - sch->limit = ctl->limit; + WRITE_ONCE(sch->limit, ctl->limit); return 0; } @@ -200,7 +202,7 @@ static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_skbprio_qopt opt; - opt.limit = sch->limit; + opt.limit = READ_ONCE(sch->limit); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) return -1; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a0d54b422186..1ab17e8a7260 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1752,10 +1752,7 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, fp[tc] = q->fp[tc]; } - nla_for_each_nested(n, opt, rem) { - if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) - continue; - + nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 59304611dc00..8badec6d82a2 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -78,7 +78,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); - if (q->q.qlen < dev->tx_queue_len) { + if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) { __skb_queue_tail(&q->q, skb); return NET_XMIT_SUCCESS; } @@ -424,7 +424,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) } while ((q = NEXT_SLAVE(q)) != m->slaves); } - dev->mtu = new_mtu; + WRITE_ONCE(dev->mtu, new_mtu); return 0; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 24368f755ab1..f7b809c0d142 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -415,7 +415,7 @@ out: if (!IS_ERR_OR_NULL(dst)) { struct rt6_info *rt; - rt = (struct rt6_info *)dst; + rt = dst_rt6_info(dst); t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index e849f368ed91..5a7436a13b74 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -552,7 +552,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; - struct rtable *rt = (struct rtable *)t->dst; + struct rtable *rt = dst_rtable(t->dst); if (rt) { saddr->v4.sin_family = AF_INET; @@ -1085,7 +1085,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); - udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, + udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false); return 0; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 08fdf1251f46..5adf0c0a6c1a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -38,6 +38,7 @@ #include <linux/inet.h> #include <linux/slab.h> #include <net/sock.h> +#include <net/proto_memory.h> #include <net/inet_ecn.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c67679a41044..64196b1dce1d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7119,6 +7119,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_assoc_ids *ids; + size_t ids_size; u32 num = 0; if (sctp_style(sk, TCP)) @@ -7131,11 +7132,11 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, num++; } - if (len < sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num) + ids_size = struct_size(ids, gaids_assoc_id, num); + if (len < ids_size) return -EINVAL; - len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; - + len = ids_size; ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; @@ -9276,7 +9277,7 @@ void sctp_data_ready(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index f65d6f92afcb..61c6f3027e7f 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -80,8 +80,6 @@ static struct ctl_table sctp_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - - { /* sentinel */ } }; /* The following index defines are used in sctp_sysctl_net_register(). @@ -384,8 +382,6 @@ static struct ctl_table sctp_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &pf_expose_max, }, - - { /* sentinel */ } }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, @@ -597,6 +593,7 @@ static int proc_sctp_do_probe_interval(struct ctl_table *ctl, int write, int sctp_sysctl_net_register(struct net *net) { + size_t table_size = ARRAY_SIZE(sctp_net_table); struct ctl_table *table; int i; @@ -604,7 +601,7 @@ int sctp_sysctl_net_register(struct net *net) if (!table) return -ENOMEM; - for (i = 0; table[i].data; i++) + for (i = 0; i < table_size; i++) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max; @@ -613,8 +610,7 @@ int sctp_sysctl_net_register(struct net *net) table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans; net->sctp.sysctl_header = register_net_sysctl_sz(net, "net/sctp", - table, - ARRAY_SIZE(sctp_net_table)); + table, table_size); if (net->sctp.sysctl_header == NULL) { kfree(table); return -ENOMEM; @@ -624,7 +620,7 @@ int sctp_sysctl_net_register(struct net *net) void sctp_sysctl_net_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->sctp.sysctl_header->ctl_table_arg; unregister_net_sysctl_table(net->sctp.sysctl_header); diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 746be3996768..ba5e6a2dd2fd 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,3 +20,16 @@ config SMC_DIAG smcss. if unsure, say Y. + +config SMC_LO + bool "SMC intra-OS shortcut with loopback-ism" + depends on SMC + default n + help + SMC_LO enables the creation of an Emulated-ISM device named + loopback-ism in SMC and makes use of it for transferring data + when communication occurs within the same OS. This helps in + convenient testing of SMC-D since loopback-ism is independent + of architecture or hardware. + + if unsure, say N. diff --git a/net/smc/Makefile b/net/smc/Makefile index 875efcd126a2..2c510d543058 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o +smc-$(CONFIG_SMC_LO) += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4b52b3b159c0..9389f0cfa374 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,7 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_loopback.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -177,7 +178,7 @@ static struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; -int smc_hash_sk(struct sock *sk) +static int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; @@ -191,9 +192,8 @@ int smc_hash_sk(struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(smc_hash_sk); -void smc_unhash_sk(struct sock *sk) +static void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -202,7 +202,6 @@ void smc_unhash_sk(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } -EXPORT_SYMBOL_GPL(smc_unhash_sk); /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the @@ -1437,6 +1436,14 @@ static int smc_connect_ism(struct smc_sock *smc, } smc_conn_save_peer_info(smc, aclc); + + if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(smc); + if (rc) { + rc = SMC_CLC_DECL_MEM; /* try to fallback */ + goto connect_abort; + } + } smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); @@ -2541,6 +2548,14 @@ static void smc_listen_work(struct work_struct *work) mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); + + if (ini->is_smcd && + smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(new_smc); + if (rc) + goto out_decl; + } + smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; @@ -3557,15 +3572,23 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_loopback_init(); + if (rc) { + pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); + goto out_ib; + } + rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_ib; + goto out_lo; } static_branch_enable(&tcp_have_smc); return 0; +out_lo: + smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3603,6 +3626,7 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); + smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 3c06625ceb20..619b3bab3824 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -18,6 +18,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_ism.h" /********************************** send *************************************/ @@ -255,6 +256,14 @@ int smcd_cdc_msg_send(struct smc_connection *conn) return rc; smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + /* if local sndbuf shares the same memory region with + * peer DMB, then don't update the tx_curs_fin + * and sndbuf_space until peer has consumed the data. + */ + return 0; + /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -266,7 +275,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn) smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); smc_tx_sndbuf_nonfull(smc); - return rc; + return 0; } /********************************* receive ***********************************/ @@ -323,7 +332,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; - int diff_cons, diff_prod; + int diff_cons, diff_prod, diff_tx; smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); @@ -339,6 +348,29 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, atomic_add(diff_cons, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); + + /* if local sndbuf shares the same memory region with + * peer RMB, then update tx_curs_fin and sndbuf_space + * here since peer has already consumed the data. + */ + if (conn->lgr->is_smcd && + smc_ism_support_dmb_nocopy(conn->lgr->smcd)) { + /* Calculate consumed data and + * increment free send buffer space. + */ + diff_tx = smc_curs_diff(conn->sndbuf_desc->len, + &conn->tx_curs_fin, + &conn->local_rx_ctrl.cons); + /* increase local sndbuf space and fin_curs */ + smp_mb__before_atomic(); + atomic_add(diff_tx, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, + &conn->local_rx_ctrl.cons, conn); + + smc_tx_sndbuf_nonfull(smc); + } } diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e55026c7529c..33fa787c28eb 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -853,8 +853,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; - v2_ext = &pclc->pclc_v2_ext; - smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + v2_ext = container_of(&pclc->pclc_v2_ext, + struct smc_clc_v2_extension, fixed); + smcd_v2_ext = container_of(&pclc->pclc_smcd_v2_ext, + struct smc_clc_smcd_v2_extension, fixed); gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 7cc7070b9772..467effb50cd6 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -134,12 +134,15 @@ struct smc_clc_smcd_gid_chid { */ struct smc_clc_v2_extension { - struct smc_clnt_opts_area_hdr hdr; - u8 roce[16]; /* RoCEv2 GID */ - u8 max_conns; - u8 max_links; - __be16 feature_mask; - u8 reserved[12]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_v2_extension_fixed, fixed, + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 max_conns; + u8 max_links; + __be16 feature_mask; + u8 reserved[12]; + ); u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -159,8 +162,11 @@ struct smc_clc_msg_smcd { /* SMC-D GID information */ }; struct smc_clc_smcd_v2_extension { - u8 system_eid[SMC_MAX_EID_LEN]; - u8 reserved[16]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + ); struct smc_clc_smcd_gid_chid gidchid[]; }; @@ -183,9 +189,9 @@ struct smc_clc_msg_proposal_area { struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_v2_extension pclc_v2_ext; + struct smc_clc_v2_extension_fixed pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; - struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; + struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; struct smc_clc_smcd_gid_chid pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9b84d5897aa5..fafdb97adfad 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1149,6 +1149,20 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, } } +static void smcd_buf_detach(struct smc_connection *conn) +{ + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + + if (!conn->sndbuf_desc) + return; + + smc_ism_detach_dmb(smcd, peer_token); + + kfree(conn->sndbuf_desc); + conn->sndbuf_desc = NULL; +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { @@ -1192,6 +1206,8 @@ void smc_conn_free(struct smc_connection *conn) if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(lgr->smcd)) + smcd_buf_detach(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); @@ -1445,6 +1461,8 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + smcd_buf_detach(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else @@ -2464,12 +2482,18 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) int rc; /* create send buffer */ + if (is_smcd && + smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) + goto create_rmb; + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; + +create_rmb: /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); - if (rc) { + if (rc && smc->conn.sndbuf_desc) { down_write(&smc->conn.lgr->sndbufs_lock); list_del(&smc->conn.sndbuf_desc->list); up_write(&smc->conn.lgr->sndbufs_lock); @@ -2479,6 +2503,41 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) return rc; } +int smcd_buf_attach(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + struct smc_buf_desc *buf_desc; + int rc; + + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return -ENOMEM; + + /* The ghost sndbuf_desc describes the same memory region as + * peer RMB. Its lifecycle is consistent with the connection's + * and it will be freed with the connections instead of the + * link group. + */ + rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc); + if (rc) + goto free; + + smc->sk.sk_sndbuf = buf_desc->len; + buf_desc->cpu_addr = + (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg); + buf_desc->len -= sizeof(struct smcd_cdc_msg); + conn->sndbuf_desc = buf_desc; + conn->sndbuf_desc->used = 1; + atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len); + return 0; + +free: + kfree(buf_desc); + return rc; +} + static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1f175376037b..d93cf51dbd7c 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -557,6 +557,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); +int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ac88de2a06a0..84f98e18c7db 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -91,6 +91,11 @@ bool smc_ism_is_v2_capable(void) return smc_ism_v2_capable; } +void smc_ism_set_v2_capable(void) +{ + smc_ism_v2_capable = true; +} + /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { @@ -126,6 +131,8 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->ops->add_vlan_id) + return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); @@ -171,6 +178,8 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->ops->del_vlan_id) + return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { @@ -222,7 +231,6 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { -#if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; @@ -231,7 +239,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; @@ -240,9 +248,46 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb_desc->len = dmb.dmb_len; } return rc; -#else - return 0; -#endif +} + +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) +{ + /* for now only loopback-ism supports + * merging sndbuf with peer DMB to avoid + * data copies between them. + */ + return (smcd->ops->support_dmb_nocopy && + smcd->ops->support_dmb_nocopy(smcd)); +} + +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc = 0; + + if (!dev->ops->attach_dmb) + return -EINVAL; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = token; + rc = dev->ops->attach_dmb(dev, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) +{ + if (!dev->ops->detach_dmb) + return -EINVAL; + + return dev->ops->detach_dmb(dev, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -322,6 +367,8 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; + if (smc_ism_is_loopback(smcd)) + goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: @@ -372,7 +419,8 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ - if (ev_info.code == ISM_EVENT_REQUEST) { + if (ev_info.code == ISM_EVENT_REQUEST && + wrk->smcd->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, &peer_gid, @@ -436,7 +484,7 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); - struct smcd_dev *smcd; + struct smcd_dev *smcd, *fentry; if (!ops) return; @@ -446,20 +494,28 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd) return; smcd->priv = ism; + smcd->client = &smc_ism_client; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); + if (smcd->ops->supports_v2()) + smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); - if (list_empty(&smcd_dev_list.list)) { - if (smcd->ops->supports_v2()) - smc_ism_v2_capable = true; - } - /* sort list: devices without pnetid before devices with pnetid */ - if (smcd->pnetid[0]) + /* sort list: + * - devices without pnetid before devices with pnetid; + * - loopback-ism always at the very beginning; + */ + if (!smcd->pnetid[0]) { + fentry = list_first_entry_or_null(&smcd_dev_list.list, + struct smcd_dev, list); + if (fentry && smc_ism_is_loopback(fentry)) + list_add(&smcd->list, &fentry->list); + else + list_add(&smcd->list, &smcd_dev_list.list); + } else { list_add_tail(&smcd->list, &smcd_dev_list.list); - else - list_add(&smcd->list, &smcd_dev_list.list); + } mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", @@ -541,6 +597,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) if (lgr->peer_shutdown) return 0; + if (!lgr->smcd->ops->signal_event) + return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 165cd013404b..6763133dd8d0 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -48,10 +48,15 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc); +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token); int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); +void smc_ism_set_v2_capable(void); int smc_ism_init(void); void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); @@ -84,4 +89,9 @@ static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) return __smc_ism_is_emulated(chid); } +static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +{ + return (smcd->ops->get_chid(smcd) == 0xFFFF); +} + #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c new file mode 100644 index 000000000000..3c5f64ca4115 --- /dev/null +++ b/net/smc/smc_loopback.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications Direct over loopback-ism device. + * + * Functions for loopback-ism device. + * + * Copyright (c) 2024, Alibaba Inc. + * + * Author: Wen Gu <guwen@linux.alibaba.com> + * Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#include <linux/device.h> +#include <linux/types.h> +#include <net/smc.h> + +#include "smc_cdc.h" +#include "smc_ism.h" +#include "smc_loopback.h" + +#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ +#define SMC_LO_SUPPORT_NOCOPY 0x1 +#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) + +static const char smc_lo_dev_name[] = "loopback-ism"; +static struct smc_lo_dev *lo_dev; + +static void smc_lo_generate_ids(struct smc_lo_dev *ldev) +{ + struct smcd_gid *lgid = &ldev->local_gid; + uuid_t uuid; + + uuid_gen(&uuid); + memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); + memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), + sizeof(lgid->gid_ext)); + + ldev->chid = SMC_LO_RESERVED_CHID; +} + +static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, + u32 vid_valid, u32 vid) +{ + struct smc_lo_dev *ldev = smcd->priv; + + /* rgid should be the same as lgid */ + if (!ldev || rgid->gid != ldev->local_gid.gid || + rgid->gid_ext != ldev->local_gid.gid_ext) + return -ENETUNREACH; + return 0; +} + +static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, + void *client_priv) +{ + struct smc_lo_dmb_node *dmb_node, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + int sba_idx, rc; + + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == SMC_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->sba_idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, + struct smc_lo_dmb_node *dmb_node) +{ + /* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kvfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __smc_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) +{ + return SMC_LO_SUPPORT_NOCOPY; +} + +static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. + */ + return -EINVAL; + + /* provide dmb information */ + dmb->sba_idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __smc_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + struct smc_connection *conn; + + if (!sf) + /* since sndbuf is merged with peer DMB, there is + * no need to copy data from sndbuf to peer DMB. + */ + return 0; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + read_unlock_bh(&ldev->dmb_ht_lock); + + conn = smcd->conn[rmb_node->sba_idx]; + if (!conn || conn->killed) + return -EPIPE; + tasklet_schedule(&conn->rx_tsklet); + return 0; +} + +static int smc_lo_supports_v2(void) +{ + return SMC_LO_V2_CAPABLE; +} + +static void smc_lo_get_local_gid(struct smcd_dev *smcd, + struct smcd_gid *smcd_gid) +{ + struct smc_lo_dev *ldev = smcd->priv; + + smcd_gid->gid = ldev->local_gid.gid; + smcd_gid->gid_ext = ldev->local_gid.gid_ext; +} + +static u16 smc_lo_get_chid(struct smcd_dev *smcd) +{ + return ((struct smc_lo_dev *)smcd->priv)->chid; +} + +static struct device *smc_lo_get_dev(struct smcd_dev *smcd) +{ + return &((struct smc_lo_dev *)smcd->priv)->dev; +} + +static const struct smcd_ops lo_ops = { + .query_remote_gid = smc_lo_query_rgid, + .register_dmb = smc_lo_register_dmb, + .unregister_dmb = smc_lo_unregister_dmb, + .support_dmb_nocopy = smc_lo_support_dmb_nocopy, + .attach_dmb = smc_lo_attach_dmb, + .detach_dmb = smc_lo_detach_dmb, + .add_vlan_id = NULL, + .del_vlan_id = NULL, + .set_vlan_required = NULL, + .reset_vlan_required = NULL, + .signal_event = NULL, + .move_data = smc_lo_move_data, + .supports_v2 = smc_lo_supports_v2, + .get_local_gid = smc_lo_get_local_gid, + .get_chid = smc_lo_get_chid, + .get_dev = smc_lo_get_dev, +}; + +static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, + int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) + goto out_smcd; + + smcd->ops = ops; + + spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); + INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); + return smcd; + +out_smcd: + kfree(smcd); + return NULL; +} + +static int smcd_lo_register_dev(struct smc_lo_dev *ldev) +{ + struct smcd_dev *smcd; + + smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); + if (!smcd) + return -ENOMEM; + ldev->smcd = smcd; + smcd->priv = ldev; + smc_ism_set_v2_capable(); + mutex_lock(&smcd_dev_list.mutex); + list_add(&smcd->list, &smcd_dev_list.list); + mutex_unlock(&smcd_dev_list.mutex); + pr_warn_ratelimited("smc: adding smcd device %s\n", + dev_name(&ldev->dev)); + return 0; +} + +static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +{ + struct smcd_dev *smcd = ldev->smcd; + + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ldev->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); + mutex_lock(&smcd_dev_list.mutex); + list_del_init(&smcd->list); + mutex_unlock(&smcd_dev_list.mutex); + kfree(smcd->conn); + kfree(smcd); +} + +static int smc_lo_dev_init(struct smc_lo_dev *ldev) +{ + smc_lo_generate_ids(ldev); + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); + + return smcd_lo_register_dev(ldev); +} + +static void smc_lo_dev_exit(struct smc_lo_dev *ldev) +{ + smcd_lo_unregister_dev(ldev); + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + +static void smc_lo_dev_release(struct device *dev) +{ + struct smc_lo_dev *ldev = + container_of(dev, struct smc_lo_dev, dev); + + kfree(ldev); +} + +static int smc_lo_dev_probe(void) +{ + struct smc_lo_dev *ldev; + int ret; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) + return -ENOMEM; + + ldev->dev.parent = NULL; + ldev->dev.release = smc_lo_dev_release; + device_initialize(&ldev->dev); + dev_set_name(&ldev->dev, smc_lo_dev_name); + + ret = smc_lo_dev_init(ldev); + if (ret) + goto free_dev; + + lo_dev = ldev; /* global loopback device */ + return 0; + +free_dev: + put_device(&ldev->dev); + return ret; +} + +static void smc_lo_dev_remove(void) +{ + if (!lo_dev) + return; + + smc_lo_dev_exit(lo_dev); + put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ +} + +int smc_loopback_init(void) +{ + return smc_lo_dev_probe(); +} + +void smc_loopback_exit(void) +{ + smc_lo_dev_remove(); +} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h new file mode 100644 index 000000000000..6dd4292dae56 --- /dev/null +++ b/net/smc/smc_loopback.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications Direct over loopback-ism device. + * + * SMC-D loopback-ism device structure definitions. + * + * Copyright (c) 2024, Alibaba Inc. + * + * Author: Wen Gu <guwen@linux.alibaba.com> + * Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#ifndef _SMC_LOOPBACK_H +#define _SMC_LOOPBACK_H + +#include <linux/device.h> +#include <linux/err.h> +#include <net/smc.h> + +#if IS_ENABLED(CONFIG_SMC_LO) +#define SMC_LO_MAX_DMBS 5000 +#define SMC_LO_DMBS_HASH_BITS 12 +#define SMC_LO_RESERVED_CHID 0xFFFF + +struct smc_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; + +struct smc_lo_dev { + struct smcd_dev *smcd; + struct device dev; + u16 chid; + struct smcd_gid local_gid; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; +}; + +int smc_loopback_init(void); +void smc_loopback_exit(void); +#else +static inline int smc_loopback_init(void) +{ + return 0; +} + +static inline void smc_loopback_exit(void) +{ +} +#endif + +#endif /* _SMC_LOOPBACK_H */ diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9a2f3638d161..f0cbe77a80b4 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a5946d1b9d60..13f2bc092db1 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -90,11 +90,11 @@ static struct ctl_table smc_table[] = { .extra1 = &conns_per_lgr_min, .extra2 = &conns_per_lgr_max, }, - { } }; int __net_init smc_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(smc_table); struct ctl_table *table; table = smc_table; @@ -105,12 +105,12 @@ int __net_init smc_sysctl_net_init(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data += (void *)net - (void *)&init_net; } net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, - ARRAY_SIZE(smc_table)); + table_size); if (!net->smc.smc_hdr) goto err_reg; @@ -133,7 +133,7 @@ err_alloc: void __net_exit smc_sysctl_net_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 93941ab12549..5f3170a1c9bb 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -160,7 +160,6 @@ static struct ctl_table debug_table[] = { .mode = 0444, .proc_handler = proc_do_xprt, }, - { } }; void diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index f86970733eb0..474f7a98fe9e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -209,7 +209,6 @@ static struct ctl_table svcrdma_parm_table[] = { .extra1 = &zero, .extra2 = &zero, }, - { }, }; static void svc_rdma_proc_cleanup(void) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 29b0562d62e7..9a8ce5df83ca 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -137,7 +137,6 @@ static struct ctl_table xr_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; #endif diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ce18716491c8..dfc353eea8ed 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -160,7 +160,6 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, }; /* diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index c9189a970eec..6488ead9e464 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -244,6 +244,99 @@ static int switchdev_port_obj_notify(enum switchdev_notifier_type nt, return 0; } +static void switchdev_obj_id_to_helpful_msg(struct net_device *dev, + enum switchdev_obj_id obj_id, + int err, bool add) +{ + const char *action = add ? "add" : "del"; + const char *reason = ""; + const char *problem; + const char *obj_str; + + switch (obj_id) { + case SWITCHDEV_OBJ_ID_UNDEFINED: + obj_str = "Undefined object"; + problem = "Attempted operation is undefined, indicating a possible programming\n" + "error.\n"; + break; + case SWITCHDEV_OBJ_ID_PORT_VLAN: + obj_str = "VLAN entry"; + problem = "Failure in VLAN settings on this port might disrupt network\n" + "segmentation or traffic isolation, affecting network partitioning.\n"; + break; + case SWITCHDEV_OBJ_ID_PORT_MDB: + obj_str = "Port Multicast Database entry"; + problem = "Failure in updating the port's Multicast Database could lead to\n" + "multicast forwarding issues.\n"; + break; + case SWITCHDEV_OBJ_ID_HOST_MDB: + obj_str = "Host Multicast Database entry"; + problem = "Failure in updating the host's Multicast Database may impact multicast\n" + "group memberships or traffic delivery, affecting multicast\n" + "communication.\n"; + break; + case SWITCHDEV_OBJ_ID_MRP: + obj_str = "Media Redundancy Protocol configuration for port"; + problem = "Failure to set MRP ring ID on this port prevents communication with\n" + "the specified redundancy ring, resulting in an inability to engage\n" + "in MRP-based network operations.\n"; + break; + case SWITCHDEV_OBJ_ID_RING_TEST_MRP: + obj_str = "MRP Test Frame Operations for port"; + problem = "Failure to generate/monitor MRP test frames may lead to inability to\n" + "assess the ring's operational integrity and fault response, hindering\n" + "proactive network management.\n"; + break; + case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: + obj_str = "MRP Ring Role Configuration"; + problem = "Improper MRP ring role configuration may create conflicts in the ring,\n" + "disrupting communication for all participants, or isolate the local\n" + "system from the ring, hindering its ability to communicate with other\n" + "participants.\n"; + break; + case SWITCHDEV_OBJ_ID_RING_STATE_MRP: + obj_str = "MRP Ring State Configuration"; + problem = "Failure to correctly set the MRP ring state can result in network\n" + "loops or leave segments without communication. In a Closed state,\n" + "it maintains loop prevention by blocking one MRM port, while an Open\n" + "state activates in response to failures, changing port states to\n" + "preserve network connectivity.\n"; + break; + case SWITCHDEV_OBJ_ID_IN_TEST_MRP: + obj_str = "MRP_InTest Frame Generation Configuration"; + problem = "Failure in managing MRP_InTest frame generation can misjudge the\n" + "interconnection ring's state, leading to incorrect blocking or\n" + "unblocking of the I/C port. This misconfiguration might result\n" + "in unintended network loops or isolate critical network segments,\n" + "compromising network integrity and reliability.\n"; + break; + case SWITCHDEV_OBJ_ID_IN_ROLE_MRP: + obj_str = "Interconnection Ring Role Configuration"; + problem = "Failure in incorrect assignment of interconnection ring roles\n" + "(MIM/MIC) can impair the formation of the interconnection rings.\n"; + break; + case SWITCHDEV_OBJ_ID_IN_STATE_MRP: + obj_str = "Interconnection Ring State Configuration"; + problem = "Failure in updating the interconnection ring state can lead in\n" + "case of Open state to incorrect blocking or unblocking of the\n" + "I/C port, resulting in unintended network loops or isolation\n" + "of critical network\n"; + break; + default: + obj_str = "Unknown object"; + problem = "Indicating a possible programming error.\n"; + } + + switch (err) { + case -ENOSPC: + reason = "Current HW/SW setup lacks sufficient resources.\n"; + break; + } + + netdev_err(dev, "Failed to %s %s (object id=%d) with error: %pe (%d).\n%s%s\n", + action, obj_str, obj_id, ERR_PTR(err), err, problem, reason); +} + static void switchdev_port_obj_add_deferred(struct net_device *dev, const void *data) { @@ -254,8 +347,7 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev, err = switchdev_port_obj_notify(SWITCHDEV_PORT_OBJ_ADD, dev, obj, NULL); if (err && err != -EOPNOTSUPP) - netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", - err, obj->id); + switchdev_obj_id_to_helpful_msg(dev, obj->id, err, true); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } @@ -304,8 +396,7 @@ static void switchdev_port_obj_del_deferred(struct net_device *dev, err = switchdev_port_obj_del_now(dev, obj); if (err && err != -EOPNOTSUPP) - netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", - err, obj->id); + switchdev_obj_id_to_helpful_msg(dev, obj->id, err, false); if (obj->complete) obj->complete(dev, err, obj->complete_priv); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7e4135db5816..798397b6811e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3565,11 +3565,8 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, rhashtable_walk_start(iter); while ((tsk = rhashtable_walk_next(iter)) != NULL) { if (IS_ERR(tsk)) { - err = PTR_ERR(tsk); - if (err == -EAGAIN) { - err = 0; + if (PTR_ERR(tsk) == -EAGAIN) continue; - } break; } diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 9fb65c988f7f..30d2e06e3d8c 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -91,7 +91,6 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - {} }; int tipc_register_sysctl(void) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index f892b0903dba..b849a3d133a0 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -174,7 +174,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, local_bh_disable(); ndst = dst_cache_get(cache); if (dst->proto == htons(ETH_P_IP)) { - struct rtable *rt = (struct rtable *)ndst; + struct rtable *rt = dst_rtable(ndst); if (!rt) { struct flowi4 fl = { diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 0cdc1f7b6b08..ce8d56a19187 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -20,6 +20,7 @@ config TLS config TLS_DEVICE bool "Transport Layer Security HW offload" depends on TLS + select SKB_DECRYPTED select SOCK_VALIDATE_XMIT select SOCK_RX_QUEUE_MAPPING default n diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index bf8ed36b1ad6..ab6e694f7bc2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -37,6 +37,7 @@ #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> +#include <linux/skbuff_ref.h> #include "tls.h" #include "trace.h" diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 4e7228f275fa..f9e3d3d90dcf 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -33,6 +33,7 @@ #include <crypto/aead.h> #include <crypto/scatterwalk.h> #include <net/ip6_checksum.h> +#include <linux/skbuff_ref.h> #include "tls.h" diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 5df08d848b5c..77e33e1e340e 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <linux/workqueue.h> #include <net/strparser.h> #include <net/tcp.h> diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index b783231668c6..305a412785f5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2147,7 +2147,6 @@ recv_end: if (ret) { if (err >= 0 || err == -EINPROGRESS) err = ret; - decrypted = 0; goto end; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9a6ad5974dff..dc1651541723 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } @@ -979,11 +979,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); - u->inflight = 0; + u->listener = NULL; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1597,6 +1597,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1692,8 +1693,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1723,6 +1724,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); @@ -1789,81 +1791,29 @@ static inline bool too_many_unix_fds(struct task_struct *p) static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - if (too_many_unix_fds(current)) return -ETOOMANYREFS; - /* Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; + UNIXCB(skb).fp = scm->fp; + scm->fp = NULL; - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; return 0; } static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; - for (i = scm->fp->count - 1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); + unix_destroy_fpl(scm->fp); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); - - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1937,8 +1887,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1946,8 +1898,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0104be9d4704..1f8b8cdfcdc8 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,277 +101,493 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } -DEFINE_SPINLOCK(unix_gc_lock); +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) +{ + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. + */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + +static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. + */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; +} + +static LIST_HEAD(unix_unvisited_vertices); + +enum unix_vertex_index { + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, + UNIX_VERTEX_INDEX_START, +}; + +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; + } + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); +} + +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!fpl->dead) + unix_update_graph(unix_edge_successor(edge)); + + list_del(&edge->vertex_entry); + vertex->out_degree--; + + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); + } +} + +static void unix_free_vertices(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } +} + +static DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; -static LIST_HEAD(gc_candidates); -static LIST_HEAD(gc_inflight_list); -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *filp) +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { - struct unix_sock *u = unix_get_socket(filp); + int i = 0, j = 0; spin_lock(&unix_gc_lock); - if (u) { - if (!u->inflight) { - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - WARN_ON_ONCE(list_empty(&u->link)); - } - u->inflight++; + if (!fpl->count_unix) + goto out; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + + receiver->scm_stat.nr_unix_fds += fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); } -void unix_notinflight(struct user_struct *user, struct file *filp) +void unix_del_edges(struct scm_fp_list *fpl) { - struct unix_sock *u = unix_get_socket(filp); + struct unix_sock *receiver; + int i = 0; spin_lock(&unix_gc_lock); - if (u) { - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(list_empty(&u->link)); + if (!fpl->count_unix) + goto out; - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); + do { + struct unix_edge *edge = fpl->edges + i++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); + if (!fpl->dead) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); spin_unlock(&unix_gc_lock); + + fpl->inflight = false; } -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +void unix_update_edges(struct unix_sock *receiver) { - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct unix_sock *u = unix_get_socket(*fp++); - - /* Ignore non-candidates, they could have been added - * to the queues after starting the garbage collection - */ - if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. + */ + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); } - spin_unlock(&x->sk_receive_queue.lock); } -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +int unix_prepare_fpl(struct scm_fp_list *fpl) { - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); + struct unix_vertex *vertex; + int i; - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); + if (!fpl->count_unix) + return 0; - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } + list_add(&vertex->entry, &fpl->vertices); } -} -static void dec_inflight(struct unix_sock *usk) -{ - usk->inflight--; + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; + + return 0; + +err: + unix_free_vertices(fpl); + return -ENOMEM; } -static void inc_inflight(struct unix_sock *usk) +void unix_destroy_fpl(struct scm_fp_list *fpl) { - usk->inflight++; + if (fpl->inflight) + unix_del_edges(fpl); + + kvfree(fpl->edges); + unix_free_vertices(fpl); } -static void inc_inflight_move_tail(struct unix_sock *u) +static bool unix_vertex_dead(struct unix_vertex *vertex) { - u->inflight++; + struct unix_edge *edge; + struct unix_sock *u; + long total_ref; - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over - */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; + + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } + + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; } -static bool gc_in_progress; +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; -static void __unix_gc(struct work_struct *work) +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) { - struct sk_buff_head hitlist; - struct unix_sock *u, *next; - LIST_HEAD(not_cycle_list); - struct list_head cursor; + struct unix_vertex *vertex; - spin_lock(&unix_gc_lock); + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - * - * Embryos, though never candidates themselves, affect which - * candidates are reachable by the garbage collector. Before - * being added to a listener's queue, an embryo may already - * receive data carrying SCM_RIGHTS, potentially making the - * passed socket a candidate that is not yet reachable by the - * collector. It becomes reachable once the embryo is - * enqueued. Therefore, we must ensure that no SCM-laden - * embryo appears in a (candidate) listener's queue between - * consecutive scan_children() calls. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - struct sock *sk = &u->sk; - long total_refs; - - total_refs = file_count(sk->sk_socket->file); - - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - - if (sk->sk_state == TCP_LISTEN) { - unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); - unix_state_unlock(sk); + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + skb_queue_splice_init(embryo_queue, hitlist); + spin_unlock(&embryo_queue->lock); + } + } else { + skb_queue_splice_init(queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; } +#endif } + + spin_unlock(&queue->lock); } +} - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + +static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) +{ + LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). + * The vertex will be popped when finalising SCC later. */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); + list_add(&vertex->scc_entry, &vertex_stack); + + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + if (!next_vertex) + continue; + + if (next_vertex->index == unix_vertex_unvisited_index) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); + + vertex = next_vertex; + goto next_vertex; - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + next_vertex = vertex; + vertex = edge->predecessor->vertex; - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index + * to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, + * propagate it to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else { + /* The successor was already grouped as another SCC */ } } - list_del(&cursor); - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); + if (vertex->index == vertex->scc_index) { + struct list_head scc; + bool scc_dead = true; -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; + /* SCC finalised. + * + * If the scc_index was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack. */ + vertex->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); } -#endif + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); } - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(struct sk_buff_head *hitlist) +{ + unsigned long last_index = UNIX_VERTEX_INDEX_START; + + unix_graph_maybe_cyclic = false; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex, &last_index, hitlist); } - spin_unlock(&unix_gc_lock); + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); - /* Here we are. Hitlist is filled. Die. */ - __skb_queue_purge(&hitlist); + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) +{ + unix_graph_maybe_cyclic = false; + + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + bool scc_dead = true; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_move_tail(&vertex->entry, &unix_visited_vertices); + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); + } + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +} + +static bool gc_in_progress; + +static void __unix_gc(struct work_struct *work) +{ + struct sk_buff_head hitlist; + struct sk_buff *skb; spin_lock(&unix_gc_lock); - /* All candidates should have been detached by now. */ - WARN_ON_ONCE(!list_empty(&gc_candidates)); + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); + goto skip_gc; + } - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, false); + __skb_queue_head_init(&hitlist); + + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); + + skb_queue_walk(&hitlist, skb) { + if (UNIXCB(skb).fp) + UNIXCB(skb).fp->dead = true; + } + + __skb_queue_purge(&hitlist); +skip_gc: + WRITE_ONCE(gc_in_progress, false); } static DECLARE_WORK(unix_gc_work, __unix_gc); diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 3e84b31c355a..357b3e5f3847 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -19,7 +19,6 @@ static struct ctl_table unix_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { } }; int __net_init unix_sysctl_register(struct net *net) @@ -52,7 +51,7 @@ err_alloc: void unix_sysctl_unregister(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->unx.ctl->ctl_table_arg; unregister_net_sysctl_table(net->unx.ctl); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 30ff9a470813..3c0bca4238d3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8116,7 +8116,8 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) is_indoor = true; } - return regulatory_hint_indoor(is_indoor, owner_nlportid); + regulatory_hint_indoor(is_indoor, owner_nlportid); + return 0; default: return -EINVAL; } @@ -9162,6 +9163,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; size_t ie_len, size; + size_t ssids_offset, ie_offset; wiphy = &rdev->wiphy; @@ -9207,21 +9209,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) return -EINVAL; size = struct_size(request, channels, n_channels); + ssids_offset = size; size = size_add(size, array_size(sizeof(*request->ssids), n_ssids)); + ie_offset = size; size = size_add(size, ie_len); request = kzalloc(size, GFP_KERNEL); if (!request) return -ENOMEM; + request->n_channels = n_channels; if (n_ssids) - request->ssids = (void *)&request->channels[n_channels]; + request->ssids = (void *)request + ssids_offset; request->n_ssids = n_ssids; - if (ie_len) { - if (n_ssids) - request->ie = (void *)(request->ssids + n_ssids); - else - request->ie = (void *)(request->channels + n_channels); - } + if (ie_len) + request->ie = (void *)request + ie_offset; i = 0; if (scan_freqs) { @@ -16031,6 +16032,7 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) params.counter_offset_presp = offset; } + params.link_id = nl80211_link_id(info->attrs); err = rdev_color_change(rdev, dev, ¶ms); out: @@ -17433,7 +17435,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_color_change, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_SET_FILS_AAD, @@ -19455,7 +19458,7 @@ EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); int cfg80211_bss_color_notify(struct net_device *dev, enum nl80211_commands cmd, u8 count, - u64 color_bitmap) + u64 color_bitmap, u8 link_id) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; @@ -19478,6 +19481,10 @@ int cfg80211_bss_color_notify(struct net_device *dev, if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; + if (wdev->valid_links && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) + goto nla_put_failure; + if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED && nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count)) goto nla_put_failure; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 753f8e9aa4b1..3cef0021a3db 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3284,7 +3284,7 @@ int regulatory_hint_user(const char *alpha2, return 0; } -int regulatory_hint_indoor(bool is_indoor, u32 portid) +void regulatory_hint_indoor(bool is_indoor, u32 portid) { spin_lock(®_indoor_lock); @@ -3307,8 +3307,6 @@ int regulatory_hint_indoor(bool is_indoor, u32 portid) if (!is_indoor) reg_check_channels(); - - return 0; } void regulatory_netlink_notify(u32 portid) @@ -3666,9 +3664,9 @@ static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) return false; } -int regulatory_hint_found_beacon(struct wiphy *wiphy, - struct ieee80211_channel *beacon_chan, - gfp_t gfp) +void regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp) { struct reg_beacon *reg_beacon; bool processing; @@ -3677,18 +3675,18 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, beacon_chan->flags & IEEE80211_CHAN_RADAR || (beacon_chan->band == NL80211_BAND_2GHZ && !freq_is_chan_12_13_14(beacon_chan->center_freq))) - return 0; + return; spin_lock_bh(®_pending_beacons_lock); processing = pending_reg_beacon(beacon_chan); spin_unlock_bh(®_pending_beacons_lock); if (processing) - return 0; + return; reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp); if (!reg_beacon) - return -ENOMEM; + return; pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", beacon_chan->center_freq, beacon_chan->freq_offset, @@ -3708,8 +3706,6 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, spin_unlock_bh(®_pending_beacons_lock); schedule_work(®_work); - - return 0; } static void print_rd_rules(const struct ieee80211_regdomain *rd) diff --git a/net/wireless/reg.h b/net/wireless/reg.h index a02ef5609f52..e1b211c4f75c 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -42,7 +42,7 @@ int regulatory_hint_user(const char *alpha2, * device is operating in an indoor environment. * @portid: the netlink port ID on which the hint was given. */ -int regulatory_hint_indoor(bool is_indoor, u32 portid); +void regulatory_hint_indoor(bool is_indoor, u32 portid); /** * regulatory_netlink_notify - notify on released netlink socket @@ -82,9 +82,9 @@ bool reg_last_request_cell_base(void); * on a newly found BSS. If you cannot make use of this feature you can * set the wiphy->disable_beacon_hints to true. */ -int regulatory_hint_found_beacon(struct wiphy *wiphy, - struct ieee80211_channel *beacon_chan, - gfp_t gfp); +void regulatory_hint_found_beacon(struct wiphy *wiphy, + struct ieee80211_channel *beacon_chan, + gfp_t gfp); /** * regulatory_hint_country_ie - hints a country IE as a regulatory domain @@ -137,13 +137,14 @@ void regulatory_hint_disconnect(void); * Get a value specifying the U-NII band frequency belongs to. * U-NII bands are defined by the FCC in C.F.R 47 part 15. * - * Returns -EINVAL if freq is invalid, 0 for UNII-1, 1 for UNII-2A, + * Return: -EINVAL if freq is invalid, 0 for UNII-1, 1 for UNII-2A, * 2 for UNII-2B, 3 for UNII-2C and 4 for UNII-3. */ int cfg80211_get_unii(int freq); /** * regulatory_indoor_allowed - is indoor operation allowed + * Return: %true if indoor operation is allowed, %false otherwise */ bool regulatory_indoor_allowed(void); @@ -173,11 +174,13 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, * reg_dfs_domain_same - Checks if both wiphy have same DFS domain configured * @wiphy1: wiphy it's dfs_region to be checked against that of wiphy2 * @wiphy2: wiphy it's dfs_region to be checked against that of wiphy1 + * Return: %true if both wiphys have the same DFS domain, %false otherwise */ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); /** * reg_reload_regdb - reload the regulatory.db firmware file + * Return: 0 for success, an error code otherwise */ int reg_reload_regdb(void); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 5a5dd3ce497f..127853877a0a 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2140,11 +2140,15 @@ static bool cfg80211_6ghz_power_type_valid(const u8 *ie, size_t ielen, switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return true; case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: return !(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT); case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return !(flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT); + default: + return false; } } return false; @@ -2207,12 +2211,16 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.pub.use_for = data->use_for; tmp.pub.cannot_use_reasons = data->cannot_use_reasons; - if (data->bss_source != BSS_SOURCE_DIRECT) { + switch (data->bss_source) { + case BSS_SOURCE_MBSSID: tmp.pub.transmitted_bss = data->source_bss; + fallthrough; + case BSS_SOURCE_STA_PROFILE: ts = bss_from_pub(data->source_bss)->ts; tmp.pub.bssid_index = data->bssid_index; tmp.pub.max_bssid_indicator = data->max_bssid_indicator; - } else { + break; + case BSS_SOURCE_DIRECT: ts = jiffies; if (channel->band == NL80211_BAND_60GHZ) { @@ -2227,6 +2235,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, regulatory_hint_found_beacon(wiphy, channel, gfp); } + break; } /* @@ -2443,7 +2452,8 @@ cfg80211_parse_mbssid_data(struct wiphy *wiphy, profile, profile_len); if (!mbssid_index_ie || mbssid_index_ie[1] < 1 || mbssid_index_ie[2] == 0 || - mbssid_index_ie[2] > 46) { + mbssid_index_ie[2] > 46 || + mbssid_index_ie[2] >= (1 << elem->data[0])) { /* No valid Multiple BSSID-Index element */ continue; } @@ -2655,6 +2665,7 @@ struct tbtt_info_iter_data { u8 param_ch_count; u32 use_for; u8 mld_id, link_id; + bool non_tx; }; static enum cfg80211_rnr_iter_ret @@ -2665,14 +2676,20 @@ cfg802121_mld_ap_rnr_iter(void *_data, u8 type, const struct ieee80211_rnr_mld_params *mld_params; struct tbtt_info_iter_data *data = _data; u8 link_id; + bool non_tx = false; if (type == IEEE80211_TBTT_INFO_TYPE_TBTT && tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11, - mld_params)) - mld_params = (void *)(tbtt_info + - offsetof(struct ieee80211_tbtt_info_ge_11, - mld_params)); - else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && + mld_params)) { + const struct ieee80211_tbtt_info_ge_11 *tbtt_info_ge_11 = + (void *)tbtt_info; + + non_tx = (tbtt_info_ge_11->bss_params & + (IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID | + IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID)) == + IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID; + mld_params = &tbtt_info_ge_11->mld_params; + } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD && tbtt_info_len >= sizeof(struct ieee80211_rnr_mld_params)) mld_params = (void *)tbtt_info; else @@ -2691,6 +2708,7 @@ cfg802121_mld_ap_rnr_iter(void *_data, u8 type, data->param_ch_count = le16_get_bits(mld_params->params, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); + data->non_tx = non_tx; if (type == IEEE80211_TBTT_INFO_TYPE_TBTT) data->use_for = NL80211_BSS_USE_FOR_ALL; @@ -2702,7 +2720,7 @@ cfg802121_mld_ap_rnr_iter(void *_data, u8 type, static u8 cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, const struct ieee80211_neighbor_ap_info **ap_info, - u8 *param_ch_count) + u8 *param_ch_count, bool *non_tx) { struct tbtt_info_iter_data data = { .mld_id = mld_id, @@ -2713,6 +2731,7 @@ cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id, *ap_info = data.ap_info; *param_ch_count = data.param_ch_count; + *non_tx = data.non_tx; return data.use_for; } @@ -2892,6 +2911,7 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, ssize_t profile_len; u8 param_ch_count; u8 link_id, use_for; + bool non_tx; if (!ieee80211_mle_basic_sta_prof_size_ok((u8 *)mle->sta_prof[i], mle->sta_prof_len[i])) @@ -2937,10 +2957,24 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy, tx_data->ielen, mld_id, link_id, &ap_info, - ¶m_ch_count); + ¶m_ch_count, + &non_tx); if (!use_for) continue; + /* + * As of 802.11be_D5.0, the specification does not give us any + * way of discovering both the MaxBSSID and the Multiple-BSSID + * Index. It does seem like the Multiple-BSSID Index element + * may be provided, but section 9.4.2.45 explicitly forbids + * including a Multiple-BSSID Element (in this case without any + * subelements). + * Without both pieces of information we cannot calculate the + * reference BSSID, so simply ignore the BSS. + */ + if (non_tx) + continue; + /* We could sanity check the BSSID is included */ if (!ieee80211_operating_class_to_band(ap_info->op_class, diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 82e3ce42206c..a8ad55f11133 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1353,6 +1353,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, return; cfg80211_wdev_release_bsses(wdev); + wdev->valid_links = 0; wdev->connected = false; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index df013c98b80d..9bf987519811 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2842,6 +2842,7 @@ TRACE_EVENT(rdev_color_change, __field(u8, count) __field(u16, bcn_ofs) __field(u16, pres_ofs) + __field(u8, link_id) ), TP_fast_assign( WIPHY_ASSIGN; @@ -2849,11 +2850,12 @@ TRACE_EVENT(rdev_color_change, __entry->count = params->count; __entry->bcn_ofs = params->counter_offset_beacon; __entry->pres_ofs = params->counter_offset_presp; + __entry->link_id = params->link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT - ", count: %u", + ", count: %u, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->count) + __entry->count, __entry->link_id) ); TRACE_EVENT(rdev_set_radar_background, diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c index e9802afa43d0..643f50874dfe 100644 --- a/net/x25/sysctl_net_x25.c +++ b/net/x25/sysctl_net_x25.c @@ -71,7 +71,6 @@ static struct ctl_table x25_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; int __init x25_register_sysctl(void) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 655fe4ff8621..703d4172c7d7 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -98,6 +98,7 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = { }; static const struct nla_policy compat_policy[XFRMA_MAX+1] = { + [XFRMA_UNSPEC] = { .strict_start_type = XFRMA_SA_DIR }, [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, [XFRMA_LASTUSED] = { .type = NLA_U64}, @@ -129,6 +130,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = { [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, [XFRMA_IF_ID] = { .type = NLA_U32 }, [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), }; static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, @@ -277,9 +279,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_SET_MARK_MASK: case XFRMA_IF_ID: case XFRMA_MTIMER_THRESH: + case XFRMA_SA_DIR: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -434,7 +437,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6346690d5c69..2455a76a1cff 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -253,6 +253,12 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } + if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) || + (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) { + NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction"); + return -EINVAL; + } + is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support UDP encapsulation and TFC padding. */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 3a2982a72a6b..d2ea18dcb0cb 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -474,6 +474,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (encap_type < 0 || (xo && xo->flags & XFRM_GRO)) { x = xfrm_input_state(skb); + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + goto drop; + } + if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); @@ -579,6 +584,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); + xfrm_state_put(x); + goto drop; + } + skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 4df5c06e3ece..e50e4bf993fa 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -926,7 +926,7 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - return xi->net; + return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 53d8fabfa685..475b904fe68b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2489,6 +2489,12 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); + if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR); + xfrm_state_put(x); + error = -EINVAL; + goto fail; + } if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; @@ -2598,8 +2604,7 @@ static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { - struct rt6_info *rt = (struct rt6_info *)dst; - path->path_cookie = rt6_get_cookie(rt); + path->path_cookie = rt6_get_cookie(dst_rt6_info(dst)); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 5f9bf8e5c933..eeb984be03a7 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -41,6 +41,8 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID), SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), + SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), SNMP_MIB_SENTINEL }; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index ce56d659c55a..bc56c6305725 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -778,7 +778,8 @@ int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack) } if (x->props.flags & XFRM_STATE_ESN) { - if (replay_esn->replay_window == 0) { + if (replay_esn->replay_window == 0 && + (!x->dir || x->dir == XFRM_SA_DIR_IN)) { NL_SET_ERR_MSG(extack, "ESN replay window must be > 0"); return -EINVAL; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0c306473a79d..649bb739df0d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1292,6 +1292,7 @@ found: if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; + x->dir = XFRM_SA_DIR_OUT; list_add(&x->km.all, &net->xfrm.state_all); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, @@ -1744,6 +1745,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->lastused = orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; + x->dir = orig->dir; return x; @@ -1864,8 +1866,14 @@ int xfrm_state_update(struct xfrm_state *x) } if (x1->km.state == XFRM_STATE_ACQ) { + if (x->dir && x1->dir != x->dir) + goto out; + __xfrm_state_insert(x); x = NULL; + } else { + if (x1->dir != x->dir) + goto out; } err = 0; diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c index 7fdeafc838a7..ca003e8a0376 100644 --- a/net/xfrm/xfrm_sysctl.c +++ b/net/xfrm/xfrm_sysctl.c @@ -38,7 +38,6 @@ static struct ctl_table xfrm_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - {} }; int __net_init xfrm_sysctl_init(struct net *net) @@ -57,10 +56,8 @@ int __net_init xfrm_sysctl_init(struct net *net) table[3].data = &net->xfrm.sysctl_acq_expires; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } net->xfrm.sysctl_hdr = register_net_sysctl_sz(net, "net/core", table, table_size); @@ -76,7 +73,7 @@ out_kmemdup: void __net_exit xfrm_sysctl_fini(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->xfrm.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->xfrm.sysctl_hdr); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 810b520493f3..e83c687bd64e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -130,7 +130,7 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_a } static inline int verify_replay(struct xfrm_usersa_info *p, - struct nlattr **attrs, + struct nlattr **attrs, u8 sa_dir, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; @@ -168,6 +168,30 @@ static inline int verify_replay(struct xfrm_usersa_info *p, return -EINVAL; } + if (sa_dir == XFRM_SA_DIR_OUT) { + if (rs->replay_window) { + NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA"); + return -EINVAL; + } + if (rs->seq || rs->seq_hi) { + NL_SET_ERR_MSG(extack, + "Replay seq and seq_hi should be 0 for output SA"); + return -EINVAL; + } + if (rs->bmp_len) { + NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA"); + return -EINVAL; + } + } + + if (sa_dir == XFRM_SA_DIR_IN) { + if (rs->oseq || rs->oseq_hi) { + NL_SET_ERR_MSG(extack, + "Replay oseq and oseq_hi should be 0 for input SA"); + return -EINVAL; + } + } + return 0; } @@ -176,6 +200,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, struct netlink_ext_ack *extack) { int err; + u8 sa_dir = attrs[XFRMA_SA_DIR] ? nla_get_u8(attrs[XFRMA_SA_DIR]) : 0; err = -EINVAL; switch (p->family) { @@ -334,7 +359,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; if ((err = verify_sec_ctx_len(attrs, extack))) goto out; - if ((err = verify_replay(p, attrs, extack))) + if ((err = verify_replay(p, attrs, sa_dir, extack))) goto out; err = -EINVAL; @@ -358,6 +383,77 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, err = -EINVAL; goto out; } + + if (sa_dir == XFRM_SA_DIR_OUT) { + NL_SET_ERR_MSG(extack, + "MTIMER_THRESH attribute should not be set on output SA"); + err = -EINVAL; + goto out; + } + } + + if (sa_dir == XFRM_SA_DIR_OUT) { + if (p->flags & XFRM_STATE_DECAP_DSCP) { + NL_SET_ERR_MSG(extack, "Flag DECAP_DSCP should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (p->flags & XFRM_STATE_ICMP) { + NL_SET_ERR_MSG(extack, "Flag ICMP should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (p->flags & XFRM_STATE_WILDRECV) { + NL_SET_ERR_MSG(extack, "Flag WILDRECV should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (p->replay_window) { + NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_REPLAY_VAL]) { + struct xfrm_replay_state *replay; + + replay = nla_data(attrs[XFRMA_REPLAY_VAL]); + + if (replay->seq || replay->bitmap) { + NL_SET_ERR_MSG(extack, + "Replay seq and bitmap should be 0 for output SA"); + err = -EINVAL; + goto out; + } + } + } + + if (sa_dir == XFRM_SA_DIR_IN) { + if (p->flags & XFRM_STATE_NOPMTUDISC) { + NL_SET_ERR_MSG(extack, "Flag NOPMTUDISC should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_SA_EXTRA_FLAGS]) { + u32 xflags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); + + if (xflags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) { + NL_SET_ERR_MSG(extack, "Flag DONT_ENCAP_DSCP should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (xflags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP) { + NL_SET_ERR_MSG(extack, "Flag OSEQ_MAY_WRAP should not be set for input SA"); + err = -EINVAL; + goto out; + } + + } } out: @@ -734,6 +830,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_SA_DIR]) + x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; @@ -1182,8 +1281,13 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } - if (x->mapping_maxage) + if (x->mapping_maxage) { ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); + if (ret) + goto out; + } + if (x->dir) + ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); out: return ret; } @@ -1618,6 +1722,9 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) goto out; + if (attrs[XFRMA_SA_DIR]) + x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]); + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); if (IS_ERR(resp_skb)) { err = PTR_ERR(resp_skb); @@ -2402,7 +2509,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur)) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ - + nla_total_size(4); /* XFRM_AE_ETHR */ + + nla_total_size(4) /* XFRM_AE_ETHR */ + + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -2459,6 +2567,12 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) goto out_cancel; + if (x->dir) { + err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); + if (err) + goto out_cancel; + } + nlmsg_end(skb, nlh); return 0; @@ -3018,6 +3132,7 @@ EXPORT_SYMBOL_GPL(xfrm_msg_min); #undef XMSGSIZE const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { + [XFRMA_UNSPEC] = { .strict_start_type = XFRMA_SA_DIR }, [XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)}, [XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)}, [XFRMA_LASTUSED] = { .type = NLA_U64}, @@ -3049,6 +3164,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, [XFRMA_IF_ID] = { .type = NLA_U32 }, [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), }; EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3097,6 +3213,24 @@ static const struct xfrm_link { [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_get_default }, }; +static int xfrm_reject_unused_attr(int type, struct nlattr **attrs, + struct netlink_ext_ack *extack) +{ + if (attrs[XFRMA_SA_DIR]) { + switch (type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_UPDSA: + case XFRM_MSG_ALLOCSPI: + break; + default: + NL_SET_ERR_MSG(extack, "Invalid attribute SA_DIR"); + return -EINVAL; + } + } + + return 0; +} + static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -3156,6 +3290,12 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto err; + if (!link->nla_pol || link->nla_pol == xfrma_policy) { + err = xfrm_reject_unused_attr((type + XFRM_MSG_BASE), attrs, extack); + if (err < 0) + goto err; + } + if (link->doit == NULL) { err = -EINVAL; goto err; @@ -3189,8 +3329,9 @@ static void xfrm_netlink_rcv(struct sk_buff *skb) static inline unsigned int xfrm_expire_msgsize(void) { - return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) - + nla_total_size(sizeof(struct xfrm_mark)); + return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + + nla_total_size(sizeof(struct xfrm_mark)) + + nla_total_size(sizeof_field(struct xfrm_state, dir)); } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -3217,6 +3358,12 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) return err; + if (x->dir) { + err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); + if (err) + return err; + } + nlmsg_end(skb, nlh); return 0; } @@ -3324,6 +3471,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->mapping_maxage) l += nla_total_size(sizeof(x->mapping_maxage)); + if (x->dir) + l += nla_total_size(sizeof(x->dir)); + return l; } |