diff options
104 files changed, 2753 insertions, 1887 deletions
diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h new file mode 100644 index 000000000000..a2b39602e87d --- /dev/null +++ b/include/linux/netfilter/nf_osf.h @@ -0,0 +1,27 @@ +#include <uapi/linux/netfilter/nf_osf.h> + +/* Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. + */ +enum nf_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers); diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 0773b5a032f1..c6935be7c6ca 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -17,10 +17,6 @@ #include <linux/if_ether.h> #include <uapi/linux/netfilter_bridge/ebtables.h> -/* return values for match() functions */ -#define EBT_MATCH 0 -#define EBT_NOMATCH 1 - struct ebt_match { struct list_head list; const char name[EBT_FUNCTION_MAXNAMELEN]; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 8df4ff798b04..4cf1ef935ed9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,6 +279,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info * !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..0ac795b41ab8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -668,6 +668,7 @@ struct ip_vs_dest { volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ + atomic_t last_weight; /* server latest weight */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0a872a7c33c8..798558fd1681 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -960,8 +960,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst); - int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h index ebd869473603..cd24be4c4a99 100644 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -6,7 +6,7 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv4_register_notifier(void); diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h index 1ed4f2631ed6..0c3b5ebf0bb8 100644 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -3,7 +3,7 @@ #define _NF_NAT_MASQUERADE_IPV6_H_ unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv6_register_notifier(void); void nf_nat_masquerade_ipv6_unregister_notifier(void); diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 833752dd0c58..ba9fa4592f2b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -6,6 +6,7 @@ #include <linux/netdevice.h> #include <linux/rhashtable.h> #include <linux/rcupdate.h> +#include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/dst.h> struct nf_flowtable; @@ -13,25 +14,24 @@ struct nf_flowtable; struct nf_flowtable_type { struct list_head list; int family; - void (*gc)(struct work_struct *work); + int (*init)(struct nf_flowtable *ft); void (*free)(struct nf_flowtable *ft); - const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; }; struct nf_flowtable { + struct list_head list; struct rhashtable rhashtable; const struct nf_flowtable_type *type; struct delayed_work gc_work; }; enum flow_offload_tuple_dir { - FLOW_OFFLOAD_DIR_ORIGINAL, - FLOW_OFFLOAD_DIR_REPLY, - __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, + FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, + FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX }; -#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) struct flow_offload_tuple { union { @@ -55,6 +55,8 @@ struct flow_offload_tuple { int oifidx; + u16 mtu; + struct dst_entry *dst_cache; }; @@ -66,6 +68,7 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 +#define FLOW_OFFLOAD_TEARDOWN 0x8 struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; @@ -98,11 +101,14 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void nf_flow_offload_work_gc(struct work_struct *work); -extern const struct rhashtable_params nf_flow_offload_rhash_params; -void flow_offload_dead(struct flow_offload *flow); +void flow_offload_teardown(struct flow_offload *flow); +static inline void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 207a467e7ca6..da3d601cadee 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -39,7 +39,7 @@ struct nf_conn_nat { /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index ce7c2b4e64bb..8bad2560576f 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -7,7 +7,7 @@ struct nf_nat_l3proto { u8 l3proto; bool (*in_range)(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range); + const struct nf_nat_range2 *range); u32 (*secure_port)(const struct nf_conntrack_tuple *t, __be16); @@ -33,7 +33,7 @@ struct nf_nat_l3proto { struct flowi *fl); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; int nf_nat_l3proto_register(const struct nf_nat_l3proto *); @@ -48,30 +48,26 @@ unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -81,29 +77,25 @@ unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); #endif /* _NF_NAT_L3PROTO_H */ diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 67835ff8a2d9..b4d6b29bca62 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -34,12 +34,12 @@ struct nf_nat_l4proto { */ void (*unique_tuple)(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; /* Protocol registration. */ @@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover); int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); #endif /*_NF_NAT_L4PROTO_H*/ diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index 5ddabb08c472..c129aacc8ae8 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_ipv4_multi_range_compat *mr, unsigned int hooknum); unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum); #endif /* _NF_NAT_REDIRECT_H_ */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cd368d1b8cb8..435c9e3b9181 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -275,23 +275,6 @@ struct nft_set_estimate { enum nft_set_class space; }; -/** - * struct nft_set_type - nf_tables set type - * - * @select_ops: function to select nft_set_ops - * @ops: default ops, used when no select_ops functions is present - * @list: used internally - * @owner: module reference - */ -struct nft_set_type { - const struct nft_set_ops *(*select_ops)(const struct nft_ctx *, - const struct nft_set_desc *desc, - u32 flags); - const struct nft_set_ops *ops; - struct list_head list; - struct module *owner; -}; - struct nft_set_ext; struct nft_expr; @@ -310,7 +293,6 @@ struct nft_expr; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size - * @features: features supported by the implementation */ struct nft_set_ops { bool (*lookup)(const struct net *net, @@ -361,9 +343,23 @@ struct nft_set_ops { void (*destroy)(const struct nft_set *set); unsigned int elemsize; +}; + +/** + * struct nft_set_type - nf_tables set type + * + * @ops: set ops for this type + * @list: used internally + * @owner: module reference + * @features: features supported by the implementation + */ +struct nft_set_type { + const struct nft_set_ops ops; + struct list_head list; + struct module *owner; u32 features; - const struct nft_set_type *type; }; +#define to_set_type(o) container_of(o, struct nft_set_type, ops) int nft_register_set(struct nft_set_type *type); void nft_unregister_set(struct nft_set_type *type); @@ -589,7 +585,7 @@ static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } -static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext) +static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } @@ -607,7 +603,7 @@ static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies(*nft_set_ext_expiration(ext)); + time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, @@ -1015,9 +1011,9 @@ static inline void *nft_obj_data(const struct nft_object *obj) #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, u32 objtype, - u8 genmask); +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); void nft_obj_notify(struct net *net, struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, @@ -1106,12 +1102,9 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask); -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data); +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index ea5aab568be8..cd6915b6c054 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -10,6 +10,9 @@ extern struct nft_expr_type nft_byteorder_type; extern struct nft_expr_type nft_payload_type; extern struct nft_expr_type nft_dynset_type; extern struct nft_expr_type nft_range_type; +extern struct nft_expr_type nft_meta_type; +extern struct nft_expr_type nft_rt_type; +extern struct nft_expr_type nft_exthdr_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h index 612cfb63ac68..ea32a7d3cf1b 100644 --- a/include/net/netfilter/nfnetlink_log.h +++ b/include/net/netfilter/nfnetlink_log.h @@ -1,18 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KER_NFNETLINK_LOG_H -#define _KER_NFNETLINK_LOG_H - -void -nfulnl_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li_user, - const char *prefix); - -#define NFULNL_COPY_DISABLED 0xff - -#endif /* _KER_NFNETLINK_LOG_H */ - diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h deleted file mode 100644 index 5c69e9b09388..000000000000 --- a/include/net/netfilter/nft_meta.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFT_META_H_ -#define _NFT_META_H_ - -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -extern const struct nla_policy nft_meta_policy[]; - -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr); - -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); - -#endif diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index a33000da7229..4a95c0db14d4 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -10,6 +10,7 @@ #define NF_NAT_RANGE_PROTO_RANDOM (1 << 2) #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -17,7 +18,7 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) struct nf_nat_ipv4_range { unsigned int flags; @@ -40,4 +41,13 @@ struct nf_nat_range { union nf_conntrack_man_proto max_proto; }; +struct nf_nat_range2 { + unsigned int flags; + union nf_inet_addr min_addr; + union nf_inet_addr max_addr; + union nf_conntrack_man_proto min_proto; + union nf_conntrack_man_proto max_proto; + union nf_conntrack_man_proto base_proto; +}; + #endif /* _NETFILTER_NF_NAT_H */ diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h new file mode 100644 index 000000000000..45376eae31ef --- /dev/null +++ b/include/uapi/linux/netfilter/nf_osf.h @@ -0,0 +1,90 @@ +#ifndef _NF_OSF_H +#define _NF_OSF_H + +#define MAXGENRELEN 32 + +#define NF_OSF_GENRE (1 << 0) +#define NF_OSF_TTL (1 << 1) +#define NF_OSF_LOG (1 << 2) +#define NF_OSF_INVERT (1 << 3) + +#define NF_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ +#define NF_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ +#define NF_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ + +#define NF_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ + +/* Do not compare ip and fingerprint TTL at all */ +#define NF_OSF_TTL_NOCHECK 2 + +/* Wildcard MSS (kind of). + * It is used to implement a state machine for the different wildcard values + * of the MSS and window sizes. + */ +struct nf_osf_wc { + __u32 wc; + __u32 val; +}; + +/* This struct represents IANA options + * http://www.iana.org/assignments/tcp-parameters + */ +struct nf_osf_opt { + __u16 kind, length; + struct nf_osf_wc wc; +}; + +struct nf_osf_info { + char genre[MAXGENRELEN]; + __u32 len; + __u32 flags; + __u32 loglevel; + __u32 ttl; +}; + +struct nf_osf_user_finger { + struct nf_osf_wc wss; + + __u8 ttl, df; + __u16 ss, mss; + __u16 opt_num; + + char genre[MAXGENRELEN]; + char version[MAXGENRELEN]; + char subtype[MAXGENRELEN]; + + /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ + struct nf_osf_opt opt[MAX_IPOPTLEN]; +}; + +struct nf_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct nf_osf_user_finger finger; +}; + +struct nf_osf_nlmsg { + struct nf_osf_user_finger f; + struct iphdr ip; + struct tcphdr tcp; +}; + +/* Defines for IANA option kinds */ +enum iana_options { + OSFOPT_EOL = 0, /* End of options */ + OSFOPT_NOP, /* NOP */ + OSFOPT_MSS, /* Maximum segment size */ + OSFOPT_WSO, /* Window scale option */ + OSFOPT_SACKP, /* SACK permitted */ + OSFOPT_SACK, /* SACK */ + OSFOPT_ECHO, + OSFOPT_ECHOREPLY, + OSFOPT_TS, /* Timestamp option */ + OSFOPT_POCP, /* Partial Order Connection Permitted */ + OSFOPT_POSP, /* Partial Order Service Profile */ + + /* Others are not used in the current OSF */ + OSFOPT_EMPTY = 255, +}; + +#endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6a3d653d5b27..ce031cf72288 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -831,7 +831,9 @@ enum nft_rt_keys { NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, NFT_RT_TCPMSS, + __NFT_RT_MAX }; +#define NFT_RT_MAX (__NFT_RT_MAX - 1) /** * enum nft_hash_types - nf_tables hash expression types @@ -949,7 +951,9 @@ enum nft_ct_keys { NFT_CT_DST_IP, NFT_CT_SRC_IP6, NFT_CT_DST_IP6, + __NFT_CT_MAX }; +#define NFT_CT_MAX (__NFT_CT_MAX - 1) /** * enum nft_ct_attributes - nf_tables ct expression netlink attributes @@ -1450,6 +1454,8 @@ enum nft_trace_types { * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) * @NFTA_NG_TYPE: operation type (NLA_U32) * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32) + * @NFTA_NG_SET_NAME: name of the map to lookup (NLA_STRING) + * @NFTA_NG_SET_ID: id of the map (NLA_U32) */ enum nft_ng_attributes { NFTA_NG_UNSPEC, @@ -1457,6 +1463,8 @@ enum nft_ng_attributes { NFTA_NG_MODULUS, NFTA_NG_TYPE, NFTA_NG_OFFSET, + NFTA_NG_SET_NAME, + NFTA_NG_SET_ID, __NFTA_NG_MAX }; #define NFTA_NG_MAX (__NFTA_NG_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 77987111cab0..1d41810d17e2 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -262,6 +262,7 @@ enum ctattr_stats_cpu { enum ctattr_stats_global { CTA_STATS_GLOBAL_UNSPEC, CTA_STATS_GLOBAL_ENTRIES, + CTA_STATS_GLOBAL_MAX_ENTRIES, __CTA_STATS_GLOBAL_MAX, }; #define CTA_STATS_GLOBAL_MAX (__CTA_STATS_GLOBAL_MAX - 1) diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index dad197e2ab99..72956eceeb09 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -23,101 +23,29 @@ #include <linux/types.h> #include <linux/ip.h> #include <linux/tcp.h> +#include <linux/netfilter/nf_osf.h> -#define MAXGENRELEN 32 +#define XT_OSF_GENRE NF_OSF_GENRE +#define XT_OSF_INVERT NF_OSF_INVERT -#define XT_OSF_GENRE (1<<0) -#define XT_OSF_TTL (1<<1) -#define XT_OSF_LOG (1<<2) -#define XT_OSF_INVERT (1<<3) +#define XT_OSF_TTL NF_OSF_TTL +#define XT_OSF_LOG NF_OSF_LOG -#define XT_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ -#define XT_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ -#define XT_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ +#define XT_OSF_LOGLEVEL_ALL NF_OSF_LOGLEVEL_ALL +#define XT_OSF_LOGLEVEL_FIRST NF_OSF_LOGLEVEL_FIRST +#define XT_OSF_LOGLEVEL_ALL_KNOWN NF_OSF_LOGLEVEL_ALL_KNOWN -#define XT_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ -#define XT_OSF_TTL_LESS 1 /* Check if ip TTL is less than fingerprint one */ -#define XT_OSF_TTL_NOCHECK 2 /* Do not compare ip and fingerprint TTL at all */ +#define XT_OSF_TTL_TRUE NF_OSF_TTL_TRUE +#define XT_OSF_TTL_NOCHECK NF_OSF_TTL_NOCHECK -struct xt_osf_info { - char genre[MAXGENRELEN]; - __u32 len; - __u32 flags; - __u32 loglevel; - __u32 ttl; -}; - -/* - * Wildcard MSS (kind of). - * It is used to implement a state machine for the different wildcard values - * of the MSS and window sizes. - */ -struct xt_osf_wc { - __u32 wc; - __u32 val; -}; - -/* - * This struct represents IANA options - * http://www.iana.org/assignments/tcp-parameters - */ -struct xt_osf_opt { - __u16 kind, length; - struct xt_osf_wc wc; -}; - -struct xt_osf_user_finger { - struct xt_osf_wc wss; - - __u8 ttl, df; - __u16 ss, mss; - __u16 opt_num; - - char genre[MAXGENRELEN]; - char version[MAXGENRELEN]; - char subtype[MAXGENRELEN]; +#define XT_OSF_TTL_LESS 1 /* Check if ip TTL is less than fingerprint one */ - /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ - struct xt_osf_opt opt[MAX_IPOPTLEN]; -}; - -struct xt_osf_nlmsg { - struct xt_osf_user_finger f; - struct iphdr ip; - struct tcphdr tcp; -}; - -/* Defines for IANA option kinds */ - -enum iana_options { - OSFOPT_EOL = 0, /* End of options */ - OSFOPT_NOP, /* NOP */ - OSFOPT_MSS, /* Maximum segment size */ - OSFOPT_WSO, /* Window scale option */ - OSFOPT_SACKP, /* SACK permitted */ - OSFOPT_SACK, /* SACK */ - OSFOPT_ECHO, - OSFOPT_ECHOREPLY, - OSFOPT_TS, /* Timestamp option */ - OSFOPT_POCP, /* Partial Order Connection Permitted */ - OSFOPT_POSP, /* Partial Order Service Profile */ - - /* Others are not used in the current OSF */ - OSFOPT_EMPTY = 255, -}; - -/* - * Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. - */ -enum xt_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; +#define xt_osf_wc nf_osf_wc +#define xt_osf_opt nf_osf_opt +#define xt_osf_info nf_osf_info +#define xt_osf_user_finger nf_osf_user_finger +#define xt_osf_finger nf_osf_finger +#define xt_osf_nlmsg nf_osf_nlmsg /* * Add/remove fingerprint from the kernel. diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 0c7dc8315013..3b86c14ea49d 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -191,6 +191,12 @@ struct ebt_entry { unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; +static __inline__ struct ebt_entry_target * +ebt_get_target(struct ebt_entry *e) +{ + return (void *)e + e->target_offset; +} + /* {g,s}etsockopt numbers */ #define EBT_BASE_CTL 128 diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h index f3cc0ef514a7..54ed83360dac 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -17,7 +17,10 @@ #define IP6T_SRH_LAST_GT 0x0100 #define IP6T_SRH_LAST_LT 0x0200 #define IP6T_SRH_TAG 0x0400 -#define IP6T_SRH_MASK 0x07FF +#define IP6T_SRH_PSID 0x0800 +#define IP6T_SRH_NSID 0x1000 +#define IP6T_SRH_LSID 0x2000 +#define IP6T_SRH_MASK 0x3FFF /* Values for "mt_invflags" field in struct ip6t_srh */ #define IP6T_SRH_INV_NEXTHDR 0x0001 @@ -31,7 +34,10 @@ #define IP6T_SRH_INV_LAST_GT 0x0100 #define IP6T_SRH_INV_LAST_LT 0x0200 #define IP6T_SRH_INV_TAG 0x0400 -#define IP6T_SRH_INV_MASK 0x07FF +#define IP6T_SRH_INV_PSID 0x0800 +#define IP6T_SRH_INV_NSID 0x1000 +#define IP6T_SRH_INV_LSID 0x2000 +#define IP6T_SRH_INV_MASK 0x3FFF /** * struct ip6t_srh - SRH match options @@ -54,4 +60,37 @@ struct ip6t_srh { __u16 mt_invflags; }; +/** + * struct ip6t_srh1 - SRH match options (revision 1) + * @ next_hdr: Next header field of SRH + * @ hdr_len: Extension header length field of SRH + * @ segs_left: Segments left field of SRH + * @ last_entry: Last entry field of SRH + * @ tag: Tag field of SRH + * @ psid_addr: Address of previous SID in SRH SID list + * @ nsid_addr: Address of NEXT SID in SRH SID list + * @ lsid_addr: Address of LAST SID in SRH SID list + * @ psid_msk: Mask of previous SID in SRH SID list + * @ nsid_msk: Mask of next SID in SRH SID list + * @ lsid_msk: MAsk of last SID in SRH SID list + * @ mt_flags: match options + * @ mt_invflags: Invert the sense of match options + */ + +struct ip6t_srh1 { + __u8 next_hdr; + __u8 hdr_len; + __u8 segs_left; + __u8 last_entry; + __u16 tag; + struct in6_addr psid_addr; + struct in6_addr nsid_addr; + struct in6_addr lsid_addr; + struct in6_addr psid_msk; + struct in6_addr nsid_msk; + struct in6_addr lsid_msk; + __u16 mt_flags; + __u16 mt_invflags; +}; + #endif /*_IP6T_SRH_H*/ diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index f212447794bd..9a0159aebe1a 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE - -config NFT_BRIDGE_META - tristate "Netfilter nf_table bridge meta support" - depends on NFT_META - help - Add support for bridge dedicated meta key. - config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 4bc758dd4a8c..9b868861f21a 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,7 +3,6 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. # -obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # packet logging diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 28a4c3490359..b286ed5596c3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb, { par->match = m->u.match; par->matchinfo = m->data; - return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH; + return !m->u.match->match(skb, par); } static inline int @@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) return (void *)entry + entry->next_offset; } +static inline const struct ebt_entry_target * +ebt_get_target_c(const struct ebt_entry *e) +{ + return ebt_get_target((struct ebt_entry *)e); +} + /* Do some firewalling */ unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, @@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, */ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); - t = (struct ebt_entry_target *) - (((char *)point) + point->target_offset); + t = ebt_get_target_c(point); /* standard target */ if (!t->u.target->target) verdict = ((struct ebt_standard_target *)t)->verdict; @@ -343,6 +348,16 @@ find_table_lock(struct net *net, const char *name, int *error, "ebtable_", error, mutex); } +static inline void ebt_free_table_info(struct ebt_table_info *info) +{ + int i; + + if (info->chainstack) { + for_each_possible_cpu(i) + vfree(info->chainstack[i]); + vfree(info->chainstack); + } +} static inline int ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, unsigned int *cnt) @@ -627,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); par.net = net; par.target = t->u.target; @@ -706,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); if (ret != 0) goto cleanup_watchers; - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); gap = e->next_offset - e->target_offset; target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); @@ -779,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack if (pos == nentries) continue; } - t = (struct ebt_entry_target *) - (((char *)e) + e->target_offset); + t = ebt_get_target_c(e); if (strcmp(t->u.name, EBT_STANDARD_TARGET)) goto letscontinue; if (e->target_offset + sizeof(struct ebt_standard_target) > @@ -975,7 +989,7 @@ static void get_counters(const struct ebt_counter *oldcounters, static int do_replace_finish(struct net *net, struct ebt_replace *repl, struct ebt_table_info *newinfo) { - int ret, i; + int ret; struct ebt_counter *counterstmp = NULL; /* used to be able to unlock earlier */ struct ebt_table_info *table; @@ -1051,13 +1065,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, ebt_cleanup_entry, net, NULL); vfree(table->entries); - if (table->chainstack) { - for_each_possible_cpu(i) - vfree(table->chainstack[i]); - vfree(table->chainstack); - } + ebt_free_table_info(table); vfree(table); - vfree(counterstmp); #ifdef CONFIG_AUDIT @@ -1078,11 +1087,7 @@ free_iterate: free_counterstmp: vfree(counterstmp); /* can be initialized in translate_table() */ - if (newinfo->chainstack) { - for_each_possible_cpu(i) - vfree(newinfo->chainstack[i]); - vfree(newinfo->chainstack); - } + ebt_free_table_info(newinfo); return ret; } @@ -1147,8 +1152,6 @@ free_newinfo: static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { - int i; - mutex_lock(&ebt_mutex); list_del(&table->list); mutex_unlock(&ebt_mutex); @@ -1157,11 +1160,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) if (table->private->nentries) module_put(table->me); vfree(table->private->entries); - if (table->private->chainstack) { - for_each_possible_cpu(i) - vfree(table->private->chainstack[i]); - vfree(table->private->chainstack); - } + ebt_free_table_info(table->private); vfree(table->private); kfree(table); } @@ -1263,11 +1262,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: - if (newinfo->chainstack) { - for_each_possible_cpu(i) - vfree(newinfo->chainstack[i]); - vfree(newinfo->chainstack); - } + ebt_free_table_info(newinfo); vfree(newinfo->entries); free_newinfo: vfree(newinfo); @@ -1405,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, return -EFAULT; hlp = ubase + (((char *)e + e->target_offset) - base); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target_c(e); ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); if (ret != 0) @@ -1746,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, return ret; target_offset = e->target_offset - (origsize - *size); - t = (struct ebt_entry_target *) ((char *) e + e->target_offset); + t = ebt_get_target(e); ret = compat_target_to_user(t, dstptr, size); if (ret) @@ -1794,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e, EBT_MATCH_ITERATE(e, compat_calc_match, &off); EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off); - t = (const struct ebt_entry_target *) ((char *) e + e->target_offset); + t = ebt_get_target_c(e); off += xt_compat_target_offset(t->u.target); off += ebt_compat_entry_padsize(); diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c deleted file mode 100644 index bb63c9aed55d..000000000000 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nft_meta.h> - -#include "../br_private.h" - -static void nft_meta_bridge_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - const struct nft_meta *priv = nft_expr_priv(expr); - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - u32 *dest = ®s->data[priv->dreg]; - const struct net_bridge_port *p; - - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - break; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - break; - default: - goto out; - } - - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; -out: - return nft_meta_get_eval(expr, regs, pkt); -err: - regs->verdict.code = NFT_BREAK; -} - -static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_meta *priv = nft_expr_priv(expr); - unsigned int len; - - priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - len = IFNAMSIZ; - break; - default: - return nft_meta_get_init(ctx, expr, tb); - } - - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); -} - -static struct nft_expr_type nft_meta_bridge_type; -static const struct nft_expr_ops nft_meta_bridge_get_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_bridge_get_eval, - .init = nft_meta_bridge_get_init, - .dump = nft_meta_get_dump, -}; - -static const struct nft_expr_ops nft_meta_bridge_set_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_set_eval, - .init = nft_meta_set_init, - .destroy = nft_meta_set_destroy, - .dump = nft_meta_set_dump, - .validate = nft_meta_set_validate, -}; - -static const struct nft_expr_ops * -nft_meta_bridge_select_ops(const struct nft_ctx *ctx, - const struct nlattr * const tb[]) -{ - if (tb[NFTA_META_KEY] == NULL) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG]) - return &nft_meta_bridge_get_ops; - - if (tb[NFTA_META_SREG]) - return &nft_meta_bridge_set_ops; - - return ERR_PTR(-EINVAL); -} - -static struct nft_expr_type nft_meta_bridge_type __read_mostly = { - .family = NFPROTO_BRIDGE, - .name = "meta", - .select_ops = nft_meta_bridge_select_ops, - .policy = nft_meta_policy, - .maxattr = NFTA_META_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_meta_bridge_module_init(void) -{ - return nft_register_expr(&nft_meta_bridge_type); -} - -static void __exit nft_meta_bridge_module_exit(void) -{ - nft_unregister_expr(&nft_meta_bridge_type); -} - -module_init(nft_meta_bridge_module_init); -module_exit(nft_meta_bridge_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); -MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 44b308d93ec2..444f125f3974 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -300,7 +300,7 @@ ipt_do_table(struct sk_buff *skb, counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); - t = ipt_get_target(e); + t = ipt_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index a03e4e7ef5f9..ce1512b02cb2 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; mr = par->targinfo; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 0f7255cc65ee..529d89ec31e8 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -33,8 +33,7 @@ static const struct xt_table nf_nat_ipv4_table = { static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { return ipt_do_table(skb, state, state->net->ipv4.nat_table); } diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 0cd46bffa469..e1e56d7123d2 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -2,265 +2,12 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> -#include <linux/rhashtable.h> -#include <linux/ip.h> -#include <linux/netdevice.h> -#include <net/ip.h> -#include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> -/* For layer 4 checksum field offset. */ -#include <linux/tcp.h> -#include <linux/udp.h> - -static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) -{ - struct tcphdr *tcph; - - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - - tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); - - return 0; -} - -static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) -{ - struct udphdr *udph; - - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - - udph = (void *)(skb_network_header(skb) + thoff); - if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace4(&udph->check, skb, addr, - new_addr, true); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } - - return 0; -} - -static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, - unsigned int thoff, __be32 addr, - __be32 new_addr) -{ - switch (iph->protocol) { - case IPPROTO_TCP: - if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - case IPPROTO_UDP: - if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - } - - return 0; -} - -static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - __be32 addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = iph->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; - iph->saddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = iph->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; - iph->daddr = new_addr; - break; - default: - return -1; - } - csum_replace4(&iph->check, addr, new_addr); - - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); -} - -static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - __be32 addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = iph->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; - iph->daddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = iph->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; - iph->saddr = new_addr; - break; - default: - return -1; - } - csum_replace4(&iph->check, addr, new_addr); - - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); -} - -static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, - enum flow_offload_tuple_dir dir) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int thoff = iph->ihl * 4; - - if (flow->flags & FLOW_OFFLOAD_SNAT && - (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) - return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && - (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) - return -1; - - return 0; -} - -static bool ip_has_options(unsigned int thoff) -{ - return thoff != sizeof(struct iphdr); -} - -static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple) -{ - struct flow_ports *ports; - unsigned int thoff; - struct iphdr *iph; - - if (!pskb_may_pull(skb, sizeof(*iph))) - return -1; - - iph = ip_hdr(skb); - thoff = iph->ihl * 4; - - if (ip_is_fragment(iph) || - unlikely(ip_has_options(thoff))) - return -1; - - if (iph->protocol != IPPROTO_TCP && - iph->protocol != IPPROTO_UDP) - return -1; - - thoff = iph->ihl * 4; - if (!pskb_may_pull(skb, thoff + sizeof(*ports))) - return -1; - - ports = (struct flow_ports *)(skb_network_header(skb) + thoff); - - tuple->src_v4.s_addr = iph->saddr; - tuple->dst_v4.s_addr = iph->daddr; - tuple->src_port = ports->source; - tuple->dst_port = ports->dest; - tuple->l3proto = AF_INET; - tuple->l4proto = iph->protocol; - tuple->iifidx = dev->ifindex; - - return 0; -} - -/* Based on ip_exceeds_mtu(). */ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) -{ - if (skb->len <= mtu) - return false; - - if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) - return false; - - if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) - return false; - - return true; -} - -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) -{ - u32 mtu; - - mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - -unsigned int -nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - const struct rtable *rt; - struct iphdr *iph; - __be32 nexthop; - - if (skb->protocol != htons(ETH_P_IP)) - return NF_ACCEPT; - - if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; - - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - - dir = tuplehash->tuple.dir; - flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - - rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) - return NF_ACCEPT; - - if (skb_try_make_writable(skb, sizeof(*iph))) - return NF_DROP; - - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ip(flow, skb, dir) < 0) - return NF_DROP; - - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; - iph = ip_hdr(skb); - ip_decrease_ttl(iph); - - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - - return NF_STOLEN; -} -EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index ac8342dcb55e..4e6b53ab6c33 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); @@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_callforwarding_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(new->status & IPS_NAT_DONE_MASK); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index f7ff6a364d7b..325e02956bf5 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, #endif /* CONFIG_XFRM */ static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); @@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); @@ -246,8 +246,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -285,7 +284,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(priv, skb, state, ct); + ret = do_chain(priv, skb, state); if (ret != NF_ACCEPT) return ret; @@ -326,8 +325,7 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; @@ -346,8 +344,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { #ifdef CONFIG_XFRM const struct nf_conn *ct; @@ -383,8 +380,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index 0c366aad89cb..f538c5001547 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -24,13 +24,13 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 8a69363b4884..5d259a12e25f 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct, struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; - struct nf_nat_range range; + struct nf_nat_range2 range; struct nf_conn_nat *nat; nat = nf_ct_nat_ext_add(ct); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index edf05002d674..00fda6331ce5 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); static void gre_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 7b98baa13ede..6d7cf1d79baf 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, static void icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index b5464a3f253b..285baccfbdea 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -28,8 +28,7 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index f18677277119..f1193e1e928a 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index dfd8af41824e..7f4493080df6 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) -{ - unsigned int mtu; - struct inet6_dev *idev; - - if (dst_metric_locked(dst, RTAX_MTU)) { - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; - } - - mtu = IPV6_MIN_MTU; - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - - return mtu; -} -EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); - static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 65c9e1a58305..7097bbf95843 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -528,7 +528,6 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) .family = NFPROTO_IPV6, }; - t = ip6t_get_target(e); return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 92c0047e7e33..491f808e356a 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c index 33719d5560c8..1059894a6f4c 100644 --- a/net/ipv6/netfilter/ip6t_srh.c +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) return true; } +static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0; + const struct ip6t_srh1 *srhinfo = par->matchinfo; + struct in6_addr *psid, *nsid, *lsid; + struct in6_addr _psid, _nsid, _lsid; + struct ipv6_sr_hdr *srh; + struct ipv6_sr_hdr _srh; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return false; + srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); + if (!srh) + return false; + + hdrlen = ipv6_optlen(srh); + if (skb->len - srhoff < hdrlen) + return false; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (srh->segments_left > srh->first_segment) + return false; + + /* Next Header matching */ + if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, + !(srh->nexthdr == srhinfo->next_hdr))) + return false; + + /* Header Extension Length matching */ + if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, + !(srh->hdrlen == srhinfo->hdr_len))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, + !(srh->hdrlen > srhinfo->hdr_len))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, + !(srh->hdrlen < srhinfo->hdr_len))) + return false; + + /* Segments Left matching */ + if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, + !(srh->segments_left == srhinfo->segs_left))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, + !(srh->segments_left > srhinfo->segs_left))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, + !(srh->segments_left < srhinfo->segs_left))) + return false; + + /** + * Last Entry matching + * Last_Entry field was introduced in revision 6 of the SRH draft. + * It was called First_Segment in the previous revision + */ + if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, + !(srh->first_segment == srhinfo->last_entry))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, + !(srh->first_segment > srhinfo->last_entry))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, + !(srh->first_segment < srhinfo->last_entry))) + return false; + + /** + * Tag matchig + * Tag field was introduced in revision 6 of the SRH draft + */ + if (srhinfo->mt_flags & IP6T_SRH_TAG) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, + !(srh->tag == srhinfo->tag))) + return false; + + /* Previous SID matching */ + if (srhinfo->mt_flags & IP6T_SRH_PSID) { + if (srh->segments_left == srh->first_segment) + return false; + psidoff = srhoff + sizeof(struct ipv6_sr_hdr) + + ((srh->segments_left + 1) * sizeof(struct in6_addr)); + psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid); + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID, + ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk, + &srhinfo->psid_addr))) + return false; + } + + /* Next SID matching */ + if (srhinfo->mt_flags & IP6T_SRH_NSID) { + if (srh->segments_left == 0) + return false; + nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) + + ((srh->segments_left - 1) * sizeof(struct in6_addr)); + nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid); + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID, + ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk, + &srhinfo->nsid_addr))) + return false; + } + + /* Last SID matching */ + if (srhinfo->mt_flags & IP6T_SRH_LSID) { + lsidoff = srhoff + sizeof(struct ipv6_sr_hdr); + lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid); + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID, + ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk, + &srhinfo->lsid_addr))) + return false; + } + return true; +} + static int srh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_srh *srhinfo = par->matchinfo; @@ -136,23 +260,54 @@ static int srh_mt6_check(const struct xt_mtchk_param *par) return 0; } -static struct xt_match srh_mt6_reg __read_mostly = { - .name = "srh", - .family = NFPROTO_IPV6, - .match = srh_mt6, - .matchsize = sizeof(struct ip6t_srh), - .checkentry = srh_mt6_check, - .me = THIS_MODULE, +static int srh1_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_srh1 *srhinfo = par->matchinfo; + + if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { + pr_info_ratelimited("unknown srh match flags %X\n", + srhinfo->mt_flags); + return -EINVAL; + } + + if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { + pr_info_ratelimited("unknown srh invflags %X\n", + srhinfo->mt_invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match srh_mt6_reg[] __read_mostly = { + { + .name = "srh", + .revision = 0, + .family = NFPROTO_IPV6, + .match = srh_mt6, + .matchsize = sizeof(struct ip6t_srh), + .checkentry = srh_mt6_check, + .me = THIS_MODULE, + }, + { + .name = "srh", + .revision = 1, + .family = NFPROTO_IPV6, + .match = srh1_mt6, + .matchsize = sizeof(struct ip6t_srh1), + .checkentry = srh1_mt6_check, + .me = THIS_MODULE, + } }; static int __init srh_mt6_init(void) { - return xt_register_match(&srh_mt6_reg); + return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } static void __exit srh_mt6_exit(void) { - xt_unregister_match(&srh_mt6_reg); + xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } module_init(srh_mt6_init); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 47306e45a80a..2bf554e18af8 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -35,8 +35,7 @@ static const struct xt_table nf_nat_ipv6_table = { static unsigned int ip6table_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat); } diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index 207cb35569b1..c511d206bf9b 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -3,256 +3,12 @@ #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> -#include <linux/ipv6.h> -#include <linux/netdevice.h> -#include <net/ipv6.h> -#include <net/ip6_route.h> -#include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> -/* For layer 4 checksum field offset. */ -#include <linux/tcp.h> -#include <linux/udp.h> - -static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) -{ - struct tcphdr *tcph; - - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - - tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, - new_addr->s6_addr32, true); - - return 0; -} - -static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) -{ - struct udphdr *udph; - - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - - udph = (void *)(skb_network_header(skb) + thoff); - if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, - new_addr->s6_addr32, true); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } - - return 0; -} - -static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, struct in6_addr *addr, - struct in6_addr *new_addr) -{ - switch (ip6h->nexthdr) { - case IPPROTO_TCP: - if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - case IPPROTO_UDP: - if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - } - - return 0; -} - -static int nf_flow_snat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - struct in6_addr addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = ip6h->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; - ip6h->saddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = ip6h->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; - ip6h->daddr = new_addr; - break; - default: - return -1; - } - - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); -} - -static int nf_flow_dnat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - struct in6_addr addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = ip6h->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; - ip6h->daddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = ip6h->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; - ip6h->saddr = new_addr; - break; - default: - return -1; - } - - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); -} - -static int nf_flow_nat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, - enum flow_offload_tuple_dir dir) -{ - struct ipv6hdr *ip6h = ipv6_hdr(skb); - unsigned int thoff = sizeof(*ip6h); - - if (flow->flags & FLOW_OFFLOAD_SNAT && - (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) - return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && - (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) - return -1; - - return 0; -} - -static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple) -{ - struct flow_ports *ports; - struct ipv6hdr *ip6h; - unsigned int thoff; - - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -1; - - ip6h = ipv6_hdr(skb); - - if (ip6h->nexthdr != IPPROTO_TCP && - ip6h->nexthdr != IPPROTO_UDP) - return -1; - - thoff = sizeof(*ip6h); - if (!pskb_may_pull(skb, thoff + sizeof(*ports))) - return -1; - - ports = (struct flow_ports *)(skb_network_header(skb) + thoff); - - tuple->src_v6 = ip6h->saddr; - tuple->dst_v6 = ip6h->daddr; - tuple->src_port = ports->source; - tuple->dst_port = ports->dest; - tuple->l3proto = AF_INET6; - tuple->l4proto = ip6h->nexthdr; - tuple->iifidx = dev->ifindex; - - return 0; -} - -/* Based on ip_exceeds_mtu(). */ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) -{ - if (skb->len <= mtu) - return false; - - if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) - return false; - - return true; -} - -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) -{ - u32 mtu; - - mtu = ip6_dst_mtu_forward(&rt->dst); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - -unsigned int -nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - struct in6_addr *nexthop; - struct ipv6hdr *ip6h; - struct rt6_info *rt; - - if (skb->protocol != htons(ETH_P_IPV6)) - return NF_ACCEPT; - - if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; - - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - - dir = tuplehash->tuple.dir; - flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) - return NF_ACCEPT; - - if (skb_try_make_writable(skb, sizeof(*ip6h))) - return NF_DROP; - - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ipv6(flow, skb, dir) < 0) - return NF_DROP; - - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; - ip6h = ipv6_hdr(skb); - ip6h->hop_limit--; - - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - - return NF_STOLEN; -} -EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 6b7f075f811f..f1582b6f9588 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, #endif static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; @@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], @@ -257,8 +257,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -303,7 +302,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(priv, skb, state, ct); + ret = do_chain(priv, skb, state); if (ret != NF_ACCEPT) return ret; @@ -343,8 +342,7 @@ nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { unsigned int ret; struct in6_addr daddr = ipv6_hdr(skb)->daddr; @@ -363,8 +361,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { #ifdef CONFIG_XFRM const struct nf_conn *ct; @@ -400,8 +397,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 98f61fcb9108..9dfc2b90c362 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -26,14 +26,14 @@ static atomic_t v6_worker_count; unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index 57593b00c5b4..d9bf42ba44fa 100644 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple, static void icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 3557b114446c..100a6bd1046a 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -26,8 +26,7 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 4146536e9c15..dd0122f3cffe 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index a27e424f690d..74269865acc8 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -22,7 +22,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 44d8a55e9721..e57c9d479503 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -444,6 +444,9 @@ config NETFILTER_SYNPROXY endif # NF_CONNTRACK +config NF_OSF + tristate 'Passive OS fingerprint infrastructure' + config NF_TABLES select NETFILTER_NETLINK tristate "Netfilter nf_tables support" @@ -474,24 +477,6 @@ config NF_TABLES_NETDEV help This option enables support for the "netdev" table. -config NFT_EXTHDR - tristate "Netfilter nf_tables exthdr module" - help - This option adds the "exthdr" expression that you can use to match - IPv6 extension headers and tcp options. - -config NFT_META - tristate "Netfilter nf_tables meta module" - help - This option adds the "meta" expression that you can use to match and - to set packet metainformation such as the packet mark. - -config NFT_RT - tristate "Netfilter nf_tables routing module" - help - This option adds the "rt" expression that you can use to match - packet routing information such as the packet nexthop. - config NFT_NUMGEN tristate "Netfilter nf_tables number generator module" help @@ -667,8 +652,7 @@ endif # NF_TABLES config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" - depends on NF_FLOW_TABLE_IPV4 - depends on NF_FLOW_TABLE_IPV6 + depends on NF_FLOW_TABLE help This option adds the flow table mixed IPv4/IPv6 support. @@ -1378,6 +1362,7 @@ config NETFILTER_XT_MATCH_NFACCT config NETFILTER_XT_MATCH_OSF tristate '"osf" Passive OS fingerprint match' depends on NETFILTER_ADVANCED && NETFILTER_NETLINK + select NF_OSF help This option selects the Passive OS Fingerprinting match module that allows to passively match the remote operating system by diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index fd32bd2c9521..1aa710b5d384 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -76,13 +76,10 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o -obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o -obj-$(CONFIG_NFT_META) += nft_meta.o -obj-$(CONFIG_NFT_RT) += nft_rt.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o @@ -104,6 +101,7 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o +obj-$(CONFIG_NF_OSF) += nf_osf.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o @@ -111,6 +109,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o +nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o + obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o # generic X tables diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index b32fb0dbe237..05dc1b77e466 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -225,6 +225,25 @@ config IP_VS_SH If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_MH + tristate "maglev hashing scheduling" + ---help--- + The maglev consistent hashing scheduling algorithm provides the + Google's Maglev hashing algorithm as a IPVS scheduler. It assigns + network connections to the servers through looking up a statically + assigned special hash table called the lookup table. Maglev hashing + is to assign a preference list of all the lookup table positions + to each destination. + + Through this operation, The maglev hashing gives an almost equal + share of the lookup table to each of the destinations and provides + minimal disruption by using the lookup table. When the set of + destinations changes, a connection will likely be sent to the same + destination as it was before. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_SED tristate "shortest expected delay scheduling" ---help--- @@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS needs to be large enough to effectively fit all the destinations multiplied by their respective weights. +comment 'IPVS MH scheduler' + +config IP_VS_MH_TAB_INDEX + int "IPVS maglev hashing table index of size (the prime numbers)" + range 8 17 + default 12 + ---help--- + The maglev hashing scheduler maps source IPs to destinations + stored in a hash table. This table is assigned by a preference + list of the positions to each destination until all slots in + the table are filled. The index determines the prime for size of + the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, + 65521 or 131071. When using weights to allow destinations to + receive more connections, the table is assigned an amount + proportional to the weights specified. The table needs to be large + enough to effectively fit all the destinations multiplied by their + respective weights. + comment 'IPVS application helper' config IP_VS_FTP diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index c552993fa4b9..bfce2677fda2 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f36098887ad0..d4f68d0f7df7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; + /* keep the last_weight with latest non-0 weight */ + if (add || udest->weight != 0) + atomic_set(&dest->last_weight, udest->weight); + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f798f8e83b..07459e71d907 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -43,6 +43,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/hash.h> #include <net/ip_vs.h> @@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 3057e453bf31..b9f375e6dc93 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -48,6 +48,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> +#include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> @@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); } @@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->counter = 1; tbl->dead = false; tbl->svc = svc; + atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 92adc04557ed..542c4949937a 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -47,6 +47,7 @@ #include <linux/jiffies.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> @@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); } @@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->counter = 1; tbl->dead = false; tbl->svc = svc; + atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c new file mode 100644 index 000000000000..0f795b186eb3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0 +/* IPVS: Maglev Hashing scheduling module + * + * Authors: Inju Song <inju.song@navercorp.com> + * + */ + +/* The mh algorithm is to assign a preference list of all the lookup + * table positions to each destination and populate the table with + * the most-preferred position of destinations. Then it is to select + * destination with the hash key of source IP address through looking + * up a the lookup table. + * + * The algorithm is detailed in: + * [3.4 Consistent Hasing] +https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include <net/ip_vs.h> + +#include <linux/siphash.h> +#include <linux/bitops.h> +#include <linux/gcd.h> + +#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ +#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ + +struct ip_vs_mh_lookup { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +struct ip_vs_mh_dest_setup { + unsigned int offset; /* starting offset */ + unsigned int skip; /* skip */ + unsigned int perm; /* next_offset */ + int turns; /* weight / gcd() and rshift */ +}; + +/* Available prime numbers for MH table */ +static int primes[] = {251, 509, 1021, 2039, 4093, + 8191, 16381, 32749, 65521, 131071}; + +/* For IPVS MH entry hash table */ +#ifndef CONFIG_IP_VS_MH_TAB_INDEX +#define CONFIG_IP_VS_MH_TAB_INDEX 12 +#endif +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] + +struct ip_vs_mh_state { + struct rcu_head rcu_head; + struct ip_vs_mh_lookup *lookup; + struct ip_vs_mh_dest_setup *dest_setup; + hsiphash_key_t hash1, hash2; + int gcd; + int rshift; +}; + +static inline void generate_hash_secret(hsiphash_key_t *hash1, + hsiphash_key_t *hash2) +{ + hash1->key[0] = 2654435761UL; + hash1->key[1] = 2654435761UL; + + hash2->key[0] = 2654446892UL; + hash2->key[1] = 2654446892UL; +} + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} + +/* Returns hash value for IPVS MH entry */ +static inline unsigned int +ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, hsiphash_key_t *key, unsigned int offset) +{ + unsigned int v; + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ + addr->ip6[2] ^ addr->ip6[3]; +#endif + v = (offset + ntohs(port) + ntohl(addr_fold)); + return hsiphash(&v, sizeof(v), key); +} + +/* Reset all the hash buckets of the specified table. */ +static void ip_vs_mh_reset(struct ip_vs_mh_state *s) +{ + int i; + struct ip_vs_mh_lookup *l; + struct ip_vs_dest *dest; + + l = &s->lookup[0]; + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(l->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(l->dest, NULL); + } + l++; + } +} + +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest; + int lw; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * permutation for the dests. + */ + if (s->gcd < 1) + return 0; + + /* Set dest_setup for the dests permutation */ + p = &svc->destinations; + ds = &s->dest_setup[0]; + while ((p = p->next) != &svc->destinations) { + dest = list_entry(p, struct ip_vs_dest, n_list); + + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash1, 0) % + IP_VS_MH_TAB_SIZE; + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash2, 0) % + (IP_VS_MH_TAB_SIZE - 1) + 1; + ds->perm = ds->offset; + + lw = atomic_read(&dest->last_weight); + ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); + ds++; + } + + return 0; +} + +static int ip_vs_mh_populate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int n, c, dt_count; + unsigned long *table; + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest, *new_dest; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * the population for the dests and reset lookup table. + */ + if (s->gcd < 1) { + ip_vs_mh_reset(s); + return 0; + } + + table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), + sizeof(unsigned long), GFP_KERNEL); + if (!table) + return -ENOMEM; + + p = &svc->destinations; + n = 0; + dt_count = 0; + while (n < IP_VS_MH_TAB_SIZE) { + if (p == &svc->destinations) + p = p->next; + + ds = &s->dest_setup[0]; + while (p != &svc->destinations) { + /* Ignore added server with zero weight */ + if (ds->turns < 1) { + p = p->next; + ds++; + continue; + } + + c = ds->perm; + while (test_bit(c, table)) { + /* Add skip, mod IP_VS_MH_TAB_SIZE */ + ds->perm += ds->skip; + if (ds->perm >= IP_VS_MH_TAB_SIZE) + ds->perm -= IP_VS_MH_TAB_SIZE; + c = ds->perm; + } + + __set_bit(c, table); + + dest = rcu_dereference_protected(s->lookup[c].dest, 1); + new_dest = list_entry(p, struct ip_vs_dest, n_list); + if (dest != new_dest) { + if (dest) + ip_vs_dest_put(dest); + ip_vs_dest_hold(new_dest); + RCU_INIT_POINTER(s->lookup[c].dest, new_dest); + } + + if (++n == IP_VS_MH_TAB_SIZE) + goto out; + + if (++dt_count >= ds->turns) { + dt_count = 0; + p = p->next; + ds++; + } + } + } + +out: + kfree(table); + return 0; +} + +/* Get ip_vs_dest associated with supplied parameters. */ +static inline struct ip_vs_dest * +ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) + % IP_VS_MH_TAB_SIZE; + struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; +} + +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */ +static inline struct ip_vs_dest * +ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* First try the dest it's supposed to go to */ + ihash = ip_vs_mh_hashkey(svc->af, addr, port, + &s->hash1, 0) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + /* If the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; + hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, + roffset) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, + "MH: selected unavailable server %s:%u (offset %u), reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + +/* Assign all the hash buckets of the specified table with the service. */ +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int ret; + + if (svc->num_dests > IP_VS_MH_TAB_SIZE) + return -EINVAL; + + if (svc->num_dests >= 1) { + s->dest_setup = kcalloc(svc->num_dests, + sizeof(struct ip_vs_mh_dest_setup), + GFP_KERNEL); + if (!s->dest_setup) + return -ENOMEM; + } + + ip_vs_mh_permutate(s, svc); + + ret = ip_vs_mh_populate(s, svc); + if (ret < 0) + goto out; + + IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n", + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + +out: + if (svc->num_dests >= 1) { + kfree(s->dest_setup); + s->dest_setup = NULL; + } + return ret; +} + +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->last_weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g; +} + +/* To avoid assigning huge weight for the MH table, + * calculate shift value with gcd. + */ +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + int mw, shift; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, return + * shift value as zero. + */ + if (gcd < 1) + return 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->last_weight); + if (new_weight > weight) + weight = new_weight; + } + + /* Because gcd is greater than zero, + * the maximum weight and gcd are always greater than zero + */ + mw = weight / gcd; + + /* shift = occupied bits of weight/gcd - MH highest bits */ + shift = fls(mw) - IP_VS_MH_TAB_BITS; + return (shift >= 0) ? shift : 0; +} + +static void ip_vs_mh_state_free(struct rcu_head *head) +{ + struct ip_vs_mh_state *s; + + s = container_of(head, struct ip_vs_mh_state, rcu_head); + kfree(s->lookup); + kfree(s); +} + +static int ip_vs_mh_init_svc(struct ip_vs_service *svc) +{ + int ret; + struct ip_vs_mh_state *s; + + /* Allocate the MH table for this service */ + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), + GFP_KERNEL); + if (!s->lookup) { + kfree(s); + return -ENOMEM; + } + + generate_hash_secret(&s->hash1, &s->hash2); + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + IP_VS_DBG(6, + "MH lookup table (memory=%zdbytes) allocated for current service\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); + + /* Assign the lookup table with current dests */ + ret = ip_vs_mh_reassign(s, svc); + if (ret < 0) { + ip_vs_mh_reset(s); + ip_vs_mh_state_free(&s->rcu_head); + return ret; + } + + /* No more failures, attach state */ + svc->sched_data = s; + return 0; +} + +static void ip_vs_mh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + /* Got to clean up lookup entry here */ + ip_vs_mh_reset(s); + + call_rcu(&s->rcu_head, ip_vs_mh_state_free); + IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); +} + +static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + /* Assign the lookup table with the updated service */ + return ip_vs_mh_reassign(s, svc); +} + +/* Helper function to get port number */ +static inline __be16 +ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + +/* Maglev Hashing scheduling */ +static struct ip_vs_dest * +ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_mh_state *s; + __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; + + IP_VS_DBG(6, "%s : Scheduling...\n", __func__); + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT) + port = ip_vs_mh_get_port(skb, iph); + + s = (struct ip_vs_mh_state *)svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) + dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port); + else + dest = ip_vs_mh_get(svc, s, hash_addr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n", + IP_VS_DBG_ADDR(svc->af, hash_addr), + ntohs(port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + +/* IPVS MH Scheduler structure */ +static struct ip_vs_scheduler ip_vs_mh_scheduler = { + .name = "mh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list), + .init_service = ip_vs_mh_init_svc, + .done_service = ip_vs_mh_done_svc, + .add_dest = ip_vs_mh_dest_changed, + .del_dest = ip_vs_mh_dest_changed, + .upd_dest = ip_vs_mh_dest_changed, + .schedule = ip_vs_mh_schedule, +}; + +static int __init ip_vs_mh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_mh_scheduler); +} + +static void __exit ip_vs_mh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_mh_scheduler); + rcu_barrier(); +} + +module_init(ip_vs_mh_init); +module_exit(ip_vs_mh_cleanup); +MODULE_DESCRIPTION("Maglev hashing ipvs scheduler"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>"); diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index bcd9b7bde4ee..569631d2b2a1 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -436,7 +436,7 @@ static bool tcp_state_active(int state) return tcp_state_active_table[state]; } -static struct tcp_states_t tcp_states [] = { +static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, @@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = { /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; -static struct tcp_states_t tcp_states_dos [] = { +static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 16aaac6eedc9..1e01c782583a 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + return (offset + hash_32(ntohs(port) + ntohl(addr_fold), + IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 41ff04ee2554..605441727008 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -186,6 +186,7 @@ unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_t nf_conntrack_generation __read_mostly; static unsigned int nf_conntrack_hash_rnd __read_mostly; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index f0e9a7511e1a..a11c304fb771 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -566,8 +566,7 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { .timeout = 5 * 60, }; -/* don't make this __exit, since it's called from __init ! */ -static void nf_conntrack_ftp_fini(void) +static void __exit nf_conntrack_ftp_fini(void) { nf_conntrack_helpers_unregister(ftp, ports_c * 2); kfree(ftp_buffer); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 5523acce9d69..4099f4d79bae 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -232,8 +232,6 @@ static int help(struct sk_buff *skb, unsigned int protoff, static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; static struct nf_conntrack_expect_policy irc_exp_policy; -static void nf_conntrack_irc_fini(void); - static int __init nf_conntrack_irc_init(void) { int i, ret; @@ -276,9 +274,7 @@ static int __init nf_conntrack_irc_init(void) return 0; } -/* This function is intentionally _NOT_ defined as __exit, because - * it is needed by the init function */ -static void nf_conntrack_irc_fini(void) +static void __exit nf_conntrack_irc_fini(void) { nf_conntrack_helpers_unregister(irc, ports_c); kfree(irc_buffer); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4c1d0c5bc268..d807b8770be3 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; + if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max))) + goto nla_put_failure; + nlmsg_end(skb, nlh); return skb->len; diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index ae457f39d5ce..5072ff96ab33 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -173,8 +173,7 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { .timeout = 5 * 60, }; -/* don't make this __exit, since it's called from __init ! */ -static void nf_conntrack_sane_fini(void) +static void __exit nf_conntrack_sane_fini(void) { nf_conntrack_helpers_unregister(sane, ports_c * 2); kfree(sane_buffer); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 908e51e2dc2b..c8d2b6688a2a 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1617,7 +1617,7 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1 }, }; -static void nf_conntrack_sip_fini(void) +static void __exit nf_conntrack_sip_fini(void) { nf_conntrack_helpers_unregister(sip, ports_c * 4); } diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 0ec6779fd5d9..548b673b3625 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -104,7 +104,7 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = { .timeout = 5 * 60, }; -static void nf_conntrack_tftp_fini(void) +static void __exit nf_conntrack_tftp_fini(void) { nf_conntrack_helpers_unregister(tftp, ports_c * 2); } diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c index ec410cae9307..eb0d1658ac05 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table_core.c @@ -4,6 +4,8 @@ #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> +#include <net/ip.h> +#include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> @@ -16,6 +18,43 @@ struct flow_offload_entry { struct rcu_head rcu_head; }; +static DEFINE_MUTEX(flowtable_lock); +static LIST_HEAD(flowtables); + +static void +flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, + struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; + struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; + struct dst_entry *dst = route->tuple[dir].dst; + + ft->dir = dir; + + switch (ctt->src.l3num) { + case NFPROTO_IPV4: + ft->src_v4 = ctt->src.u3.in; + ft->dst_v4 = ctt->dst.u3.in; + ft->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case NFPROTO_IPV6: + ft->src_v6 = ctt->src.u3.in6; + ft->dst_v6 = ctt->dst.u3.in6; + ft->mtu = ip6_dst_mtu_forward(dst); + break; + } + + ft->l3proto = ctt->src.l3num; + ft->l4proto = ctt->dst.protonum; + ft->src_port = ctt->src.u.tcp.port; + ft->dst_port = ctt->dst.u.tcp.port; + + ft->iifidx = route->tuple[dir].ifindex; + ft->oifidx = route->tuple[!dir].ifindex; + ft->dst_cache = dst; +} + struct flow_offload * flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) { @@ -40,69 +79,12 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) entry->ct = ct; - switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) { - case NFPROTO_IPV4: - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in; - break; - case NFPROTO_IPV6: - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6; - break; - } - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache = - route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache = - route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = - FLOW_OFFLOAD_DIR_ORIGINAL; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = - FLOW_OFFLOAD_DIR_REPLY; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = - route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = - route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = - route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = - route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; + flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) flow->flags |= FLOW_OFFLOAD_SNAT; - else if (ct->status & IPS_DST_NAT) + if (ct->status & IPS_DST_NAT) flow->flags |= FLOW_OFFLOAD_DNAT; return flow; @@ -118,6 +100,43 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +{ + tcp->state = TCP_CONNTRACK_ESTABLISHED; + tcp->seen[0].td_maxwin = 0; + tcp->seen[1].td_maxwin = 0; +} + +static void flow_offload_fixup_ct_state(struct nf_conn *ct) +{ + const struct nf_conntrack_l4proto *l4proto; + struct net *net = nf_ct_net(ct); + unsigned int *timeouts; + unsigned int timeout; + int l4num; + + l4num = nf_ct_protonum(ct); + if (l4num == IPPROTO_TCP) + flow_offload_fixup_tcp(&ct->proto.tcp); + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num); + if (!l4proto) + return; + + timeouts = l4proto->get_timeouts(net); + if (!timeouts) + return; + + if (l4num == IPPROTO_TCP) + timeout = timeouts[TCP_CONNTRACK_ESTABLISHED]; + else if (l4num == IPPROTO_UDP) + timeout = timeouts[UDP_CT_REPLIED]; + else + return; + + ct->timeout = nfct_time_stamp + timeout; +} + void flow_offload_free(struct flow_offload *flow) { struct flow_offload_entry *e; @@ -125,17 +144,46 @@ void flow_offload_free(struct flow_offload *flow) dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); e = container_of(flow, struct flow_offload_entry, flow); - nf_ct_delete(e->ct, 0, 0); + if (flow->flags & FLOW_OFFLOAD_DYING) + nf_ct_delete(e->ct, 0, 0); nf_ct_put(e->ct); kfree_rcu(e, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); -void flow_offload_dead(struct flow_offload *flow) +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) { - flow->flags |= FLOW_OFFLOAD_DYING; + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; } -EXPORT_SYMBOL_GPL(flow_offload_dead); + +static const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -143,10 +191,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -154,22 +202,51 @@ EXPORT_SYMBOL_GPL(flow_offload_add); static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { + struct flow_offload_entry *e; + rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); + + e = container_of(flow, struct flow_offload_entry, flow); + clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); flow_offload_free(flow); } +void flow_offload_teardown(struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + flow->flags |= FLOW_OFFLOAD_TEARDOWN; + + e = container_of(flow, struct flow_offload_entry, flow); + flow_offload_fixup_ct_state(e->ct); +} +EXPORT_SYMBOL_GPL(flow_offload_teardown); + struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { - return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, - *flow_table->type->params); + struct flow_offload_tuple_rhash *tuplehash; + struct flow_offload *flow; + int dir; + + tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple, + nf_flow_offload_rhash_params); + if (!tuplehash) + return NULL; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) + return NULL; + + return tuplehash; } EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -216,11 +293,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static inline bool nf_flow_is_dying(const struct flow_offload *flow) -{ - return flow->flags & FLOW_OFFLOAD_DYING; -} - static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; @@ -248,7 +320,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); if (nf_flow_has_expired(flow) || - nf_flow_is_dying(flow)) + (flow->flags & (FLOW_OFFLOAD_DYING | + FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } out: @@ -258,7 +331,7 @@ out: return 1; } -void nf_flow_offload_work_gc(struct work_struct *work) +static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; @@ -266,42 +339,6 @@ void nf_flow_offload_work_gc(struct work_struct *work) nf_flow_offload_gc_step(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } -EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); - -static u32 flow_offload_hash(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple *tuple = data; - - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple_rhash *tuplehash = data; - - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct flow_offload_tuple *tuple = arg->key; - const struct flow_offload_tuple_rhash *x = ptr; - - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) - return 1; - - return 0; -} - -const struct rhashtable_params nf_flow_offload_rhash_params = { - .head_offset = offsetof(struct flow_offload_tuple_rhash, node), - .hashfn = flow_offload_hash, - .obj_hashfn = flow_offload_hash_obj, - .obj_cmpfn = flow_offload_hash_cmp, - .automatic_shrinking = true, -}; -EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params); static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) @@ -419,33 +456,69 @@ int nf_flow_dnat_port(const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); +int nf_flow_table_init(struct nf_flowtable *flowtable) +{ + int err; + + INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + + err = rhashtable_init(&flowtable->rhashtable, + &nf_flow_offload_rhash_params); + if (err < 0) + return err; + + queue_delayed_work(system_power_efficient_wq, + &flowtable->gc_work, HZ); + + mutex_lock(&flowtable_lock); + list_add(&flowtable->list, &flowtables); + mutex_unlock(&flowtable_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_table_init); + static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + if (!dev) { + flow_offload_teardown(flow); return; + } - flow_offload_dead(flow); + if (flow->tuplehash[0].tuple.iifidx == dev->ifindex || + flow->tuplehash[1].tuple.iifidx == dev->ifindex) + flow_offload_dead(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - void *data) + struct net_device *dev) { - nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data); + nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); } void nf_flow_table_cleanup(struct net *net, struct net_device *dev) { - nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev); + struct nf_flowtable *flowtable; + + mutex_lock(&flowtable_lock); + list_for_each_entry(flowtable, &flowtables, list) + nf_flow_table_iterate_cleanup(flowtable, dev); + mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { + mutex_lock(&flowtable_lock); + list_del(&flow_table->list); + mutex_unlock(&flowtable_lock); + cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); WARN_ON(!nf_flow_offload_gc_step(flow_table)); + rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 375a1881d93d..99771aa7e7ea 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c new file mode 100644 index 000000000000..82451b7e0acb --- /dev/null +++ b/net/netfilter/nf_flow_table_ip.c @@ -0,0 +1,487 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_state_check(struct flow_offload *flow, int proto, + struct sk_buff *skb, unsigned int thoff) +{ + struct tcphdr *tcph; + + if (proto != IPPROTO_TCP) + return 0; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + if (unlikely(tcph->fin || tcph->rst)) { + flow_offload_teardown(flow); + return -1; + } + + return 0; +} + +static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); + + return 0; +} + +static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) +{ + switch (iph->protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, enum flow_offload_tuple_dir dir) +{ + struct iphdr *iph = ip_hdr(skb); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + + return 0; +} + +static bool ip_has_options(unsigned int thoff) +{ + return thoff != sizeof(struct iphdr); +} + +static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return -1; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(ip_has_options(thoff))) + return -1; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return -1; + + thoff = iph->ihl * 4; + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) + return false; + + return true; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + const struct rtable *rt; + unsigned int thoff; + struct iphdr *iph; + __be32 nexthop; + + if (skb->protocol != htons(ETH_P_IP)) + return NF_ACCEPT; + + if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && + (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*iph))) + return NF_DROP; + + thoff = ip_hdr(skb)->ihl * 4; + if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) + return NF_ACCEPT; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ip(flow, skb, thoff, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); + +static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + + return 0; +} + +static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) +{ + switch (ip6h->nexthdr) { + case IPPROTO_TCP: + if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; + ip6h->daddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; + ip6h->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + unsigned int thoff = sizeof(*ip6h); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + + return 0; +} + +static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -1; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return -1; + + thoff = sizeof(*ip6h); + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + tuple->iifidx = dev->ifindex; + + return 0; +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + struct in6_addr *nexthop; + struct ipv6hdr *ip6h; + struct rt6_info *rt; + + if (skb->protocol != htons(ETH_P_IPV6)) + return NF_ACCEPT; + + if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) + return NF_ACCEPT; + + if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb, + sizeof(*ip6h))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*ip6h))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ipv6(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 617693ff9f4c..37b3c9913b08 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple); static int in_range(const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. @@ -194,7 +194,7 @@ find_appropriate_src(struct net *net, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; @@ -224,7 +224,7 @@ find_appropriate_src(struct net *net, static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -298,7 +298,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone, static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -349,9 +349,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { - if (l4proto->in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && + l4proto->in_range(tuple, maniptype, + &range->min_proto, + &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) goto out; @@ -360,7 +361,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, } } - /* Last change: get protocol to try to obtain unique tuple. */ + /* Last chance: get protocol to try to obtain unique tuple. */ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); out: rcu_read_unlock(); @@ -381,7 +382,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); @@ -459,7 +460,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); - struct nf_nat_range range = { + struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, @@ -702,7 +703,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, - struct nf_nat_range *range) + struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; const struct nf_nat_l4proto *l4proto; @@ -730,7 +731,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { static int nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_conn *ct, struct nf_nat_range2 *range, const struct nf_nat_l3proto *l3proto) { struct nlattr *tb[CTA_NAT_MAX+1]; @@ -758,7 +759,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_l3proto *l3proto; int err; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 607a373379b4..99606baedda4 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet); void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 7d7466dbf663..5d849d835561 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover) @@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, : tuple->src.u.all); } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { off = prandom_u32(); + } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) { + off = (ntohs(*portptr) - ntohs(range->base_proto.all)); } else { off = *rover; } @@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, *portptr = htons(min + off % range_size); if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) + if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL| + NF_NAT_RANGE_PROTO_OFFSET))) *rover = off; return; } @@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index 269fcd5dc34c..67ea0d83aa5a 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover; static void dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index c57ee3240b1d..1c5d9b65fbba 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover; static void sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 4f8820fc5148..f15fcd475f98 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -23,7 +23,7 @@ static u16 tcp_port_rover; static void tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index edd4a77dc09a..5790f70a83b2 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -22,7 +22,7 @@ static u16 udp_port_rover; static void udp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { @@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, static void udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c index 6e494d584412..c5db3e251232 100644 --- a/net/netfilter/nf_nat_proto_unknown.c +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 25b06b959118..7c4bb0a773ca 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -36,7 +36,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); @@ -82,10 +82,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; struct in6_addr newdst; enum ip_conntrack_info ctinfo; struct nf_conn *ct; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 791fac4fd745..1f3086074981 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, static void nf_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c new file mode 100644 index 000000000000..5ba5c7bef2f9 --- /dev/null +++ b/net/netfilter/nf_osf.c @@ -0,0 +1,218 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/kernel.h> + +#include <linux/capability.h> +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/tcp.h> + +#include <net/ip.h> +#include <net/tcp.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netfilter/nf_osf.h> + +static inline int nf_osf_ttl(const struct sk_buff *skb, + const struct nf_osf_info *info, + unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (info->flags & NF_OSF_TTL) { + if (info->ttl == NF_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (info->ttl == NF_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +bool +nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers) +{ + const unsigned char *optp = NULL, *_optp = NULL; + unsigned int optsize = 0, check_WSS = 0; + int fmatch = FMATCH_WRONG, fcount = 0; + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + u16 window, totlen, mss = 0; + const struct tcphdr *tcp; + struct tcphdr _tcph; + bool df; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return false; + + if (!tcp->syn) + return false; + + totlen = ntohs(ip->tot_len); + df = ntohs(ip->frag_off) & IP_DF; + window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), optsize, opts); + } + + list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) { + int foptsize, optnum; + + f = &kf->finger; + + if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + optp = _optp; + fmatch = FMATCH_WRONG; + + if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl)) + continue; + + /* + * Should not happen if userspace parser was written correctly. + */ + if (f->wss.wc >= OSF_WSS_MAX) + continue; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + continue; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs((__force __be16)mss); + break; + case OSFOPT_TS: + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. + */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + if (fmatch != FMATCH_OK) + continue; + + fcount++; + + if (info->flags & NF_OSF_LOG) + nf_log_packet(net, family, hooknum, skb, + in, out, NULL, + "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & NF_OSF_LOG) && + info->loglevel == NF_OSF_LOGLEVEL_FIRST) + break; + } + + if (!fcount && (info->flags & NF_OSF_LOG)) + nf_log_packet(net, family, hooknum, skb, in, out, NULL, + "Remote OS is not known: %pI4:%u -> %pI4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} +EXPORT_SYMBOL_GPL(nf_osf_match); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 04d4e3772584..18bd584fadda 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -386,13 +386,17 @@ static struct nft_table *nft_table_lookup(const struct net *net, { struct nft_table *table; + if (nla == NULL) + return ERR_PTR(-EINVAL); + list_for_each_entry(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) return table; } - return NULL; + + return ERR_PTR(-ENOENT); } static struct nft_table *nft_table_lookup_byhandle(const struct net *net, @@ -406,37 +410,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_active_genmask(table, genmask)) return table; } - return NULL; -} - -static struct nft_table *nf_tables_table_lookup(const struct net *net, - const struct nlattr *nla, - u8 family, u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup(net, nla, family, genmask); - if (table != NULL) - return table; - - return ERR_PTR(-ENOENT); -} - -static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net, - const struct nlattr *nla, - u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup_byhandle(net, nla, genmask); - if (table != NULL) - return table; return ERR_PTR(-ENOENT); } @@ -608,10 +581,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -727,21 +701,23 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - const struct nlattr *name; - struct nft_table *table; int family = nfmsg->nfgen_family; + const struct nlattr *attr; + struct nft_table *table; u32 flags = 0; struct nft_ctx ctx; int err; - name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(net, name, family, genmask); + attr = nla[NFTA_TABLE_NAME]; + table = nft_table_lookup(net, attr, family, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; @@ -760,7 +736,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table == NULL) goto err_kzalloc; - table->name = nla_strdup(name, GFP_KERNEL); + table->name = nla_strdup(attr, GFP_KERNEL); if (table->name == NULL) goto err_strdup; @@ -883,8 +859,9 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_table *table; int family = nfmsg->nfgen_family; + const struct nlattr *attr; + struct nft_table *table; struct nft_ctx ctx; nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); @@ -892,16 +869,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); - if (nla[NFTA_TABLE_HANDLE]) - table = nf_tables_table_lookup_byhandle(net, - nla[NFTA_TABLE_HANDLE], - genmask); - else - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], - family, genmask); + if (nla[NFTA_TABLE_HANDLE]) { + attr = nla[NFTA_TABLE_HANDLE]; + table = nft_table_lookup_byhandle(net, attr, genmask); + } else { + attr = nla[NFTA_TABLE_NAME]; + table = nft_table_lookup(net, attr, family, genmask); + } - if (IS_ERR(table)) + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); + } if (nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) @@ -949,8 +928,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, - u8 genmask) +nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) { struct nft_chain *chain; @@ -963,9 +941,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, return ERR_PTR(-ENOENT); } -static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +static struct nft_chain *nft_chain_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_chain *chain; @@ -1194,14 +1171,17 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); + } - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1513,8 +1493,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], - genmask); + chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) return -EEXIST; } @@ -1564,9 +1543,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nlattr * uninitialized_var(name); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; u8 policy = NF_ACCEPT; @@ -1576,36 +1555,46 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); + } chain = NULL; - name = nla[NFTA_CHAIN_NAME]; + attr = nla[NFTA_CHAIN_NAME]; if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup_byhandle(table, handle, genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]); return PTR_ERR(chain); + } + attr = nla[NFTA_CHAIN_HANDLE]; } else { - chain = nf_tables_chain_lookup(table, name, genmask); + chain = nft_chain_lookup(table, attr, genmask); if (IS_ERR(chain)) { - if (PTR_ERR(chain) != -ENOENT) + if (PTR_ERR(chain) != -ENOENT) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); + } chain = NULL; } } if (nla[NFTA_CHAIN_POLICY]) { if (chain != NULL && - !nft_is_base_chain(chain)) + !nft_is_base_chain(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; + } if (chain == NULL && - nla[NFTA_CHAIN_HOOK] == NULL) + nla[NFTA_CHAIN_HOOK] == NULL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; + } policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); switch (policy) { @@ -1620,8 +1609,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; @@ -1638,28 +1629,34 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule; - int family = nfmsg->nfgen_family; struct nft_ctx ctx; u64 handle; u32 use; int err; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); + } if (nla[NFTA_CHAIN_HANDLE]) { - handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + attr = nla[NFTA_CHAIN_HANDLE]; + handle = be64_to_cpu(nla_get_be64(attr)); + chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + attr = nla[NFTA_CHAIN_NAME]; + chain = nft_chain_lookup(table, attr, genmask); } - if (IS_ERR(chain)) + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); + } if (nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) @@ -1681,8 +1678,10 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, /* There are rules and elements that are still holding references to us, * we cannot do a recursive removal in this case. */ - if (use > 0) + if (use > 0) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } return nft_delchain(&ctx); } @@ -1939,8 +1938,8 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, - u64 handle) +static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, + u64 handle) { struct nft_rule *rule; @@ -1953,13 +1952,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, - const struct nlattr *nla) +static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, + const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -2191,18 +2190,23 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); + } - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); + } - rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); - if (IS_ERR(rule)) + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -2265,23 +2269,30 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); + } - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); + } if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nf_tables_rule_lookup(chain, handle); - if (IS_ERR(rule)) + rule = __nft_rule_lookup(chain, handle); + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); + } - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else @@ -2300,9 +2311,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nf_tables_rule_lookup(chain, pos_handle); - if (IS_ERR(old_rule)) + old_rule = __nft_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); + } } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -2440,32 +2453,37 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); + } if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], - genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); + } } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nf_tables_rule_lookup(chain, - nla[NFTA_RULE_HANDLE]); - if (IS_ERR(rule)) + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); + } err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); - if (IS_ERR(rule)) + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); + } err = nft_delrule(&ctx, rule); } else { @@ -2510,14 +2528,12 @@ void nft_unregister_set(struct nft_set_type *type) EXPORT_SYMBOL_GPL(nft_unregister_set); #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ - NFT_SET_TIMEOUT | NFT_SET_OBJECT) + NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ + NFT_SET_EVAL) -static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) +static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) { - if ((flags & NFT_SET_EVAL) && !ops->update) - return false; - - return (flags & ops->features) == (flags & NFT_SET_FEATURES); + return (flags & type->features) == (flags & NFT_SET_FEATURES); } /* @@ -2554,14 +2570,9 @@ nft_select_set_ops(const struct nft_ctx *ctx, best.space = ~0; list_for_each_entry(type, &nf_tables_set_types, list) { - if (!type->select_ops) - ops = type->ops; - else - ops = type->select_ops(ctx, desc, flags); - if (!ops) - continue; + ops = &type->ops; - if (!nft_set_ops_candidate(ops, flags)) + if (!nft_set_ops_candidate(type, flags)) continue; if (!ops->estimate(desc, flags, &est)) continue; @@ -2592,7 +2603,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!try_module_get(type->owner)) continue; if (bops != NULL) - module_put(bops->type->owner); + module_put(to_set_type(bops)->owner); bops = ops; best = est; @@ -2633,6 +2644,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], + struct netlink_ext_ack *extack, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2640,18 +2652,20 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, struct nft_table *table = NULL; if (nla[NFTA_SET_TABLE] != NULL) { - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); + } } nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; } -static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2666,14 +2680,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask) { struct nft_set *set; - if (nla == NULL) - return ERR_PTR(-EINVAL); - list_for_each_entry(set, &table->sets, list) { if (be64_to_cpu(nla_get_be64(nla)) == set->handle && nft_active_genmask(set, genmask)) @@ -2682,9 +2694,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla, - u8 genmask) +static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nlattr *nla, u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); @@ -2708,12 +2719,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nf_tables_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; - set = nf_tables_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, nla_set_id, genmask); } return set; } @@ -2773,6 +2784,27 @@ cont: return 0; } +static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) +{ + u64 ms = be64_to_cpu(nla_get_be64(nla)); + u64 max = (u64)(~((u64)0)); + + max = div_u64(max, NSEC_PER_MSEC); + if (ms >= max) + return -ERANGE; + + ms *= NSEC_PER_MSEC; + *result = nsecs_to_jiffies64(ms); + return 0; +} + +static u64 nf_jiffies64_to_msecs(u64 input) +{ + u64 ms = jiffies64_to_nsecs(input); + + return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -2820,7 +2852,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (set->timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, - cpu_to_be64(jiffies_to_msecs(set->timeout)), + nf_jiffies64_to_msecs(set->timeout), NFTA_SET_PAD)) goto nla_put_failure; if (set->gc_int && @@ -2958,7 +2990,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; @@ -2985,7 +3018,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3115,8 +3148,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - nla[NFTA_SET_TIMEOUT]))); + + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + if (err) + return err; } gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { @@ -3137,22 +3172,28 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); + } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { - if (PTR_ERR(set) != -ENOENT) + if (PTR_ERR(set) != -ENOENT) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); + } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + return 0; } @@ -3229,14 +3270,14 @@ err3: err2: kvfree(set); err1: - module_put(ops->type->owner); + module_put(to_set_type(ops)->owner); return err; } static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); - module_put(set->ops->type->owner); + module_put(to_set_type(set->ops)->owner); kfree(set->name); kvfree(set); } @@ -3255,6 +3296,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; int err; @@ -3264,20 +3306,28 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; - if (nla[NFTA_SET_HANDLE]) - set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); - else - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); - if (IS_ERR(set)) - return PTR_ERR(set); + if (nla[NFTA_SET_HANDLE]) { + attr = nla[NFTA_SET_HANDLE]; + set = nft_set_lookup_byhandle(ctx.table, attr, genmask); + } else { + attr = nla[NFTA_SET_NAME]; + set = nft_set_lookup(ctx.table, attr, genmask); + } + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, attr); + return PTR_ERR(set); + } if (!list_empty(&set->bindings) || - (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) + (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } return nft_delset(&ctx, set); } @@ -3367,8 +3417,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u64), }, [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(unsigned long), - .align = __alignof__(unsigned long), + .len = sizeof(u64), + .align = __alignof__(u64), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -3405,16 +3455,19 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], + struct netlink_ext_ack *extack, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int family = nfmsg->nfgen_family; struct nft_table *table; - table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); + } nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; @@ -3458,22 +3511,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(jiffies_to_msecs( - *nft_set_ext_timeout(ext))), + nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), NFTA_SET_ELEM_PAD)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - unsigned long expires, now = jiffies; + u64 expires, now = get_jiffies_64(); expires = *nft_set_ext_expiration(ext); - if (time_before(now, expires)) + if (time_before64(now, expires)) expires -= now; else expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - cpu_to_be64(jiffies_to_msecs(expires)), + nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; } @@ -3744,12 +3796,12 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3848,7 +3900,7 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) *nft_set_ext_expiration(ext) = - jiffies + timeout; + get_jiffies_64() + timeout; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -3935,8 +3987,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - nla[NFTA_SET_ELEM_TIMEOUT]))); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT], + &timeout); + if (err) + return err; } else if (set->flags & NFT_SET_TIMEOUT) { timeout = set->timeout; } @@ -3961,8 +4015,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EINVAL; goto err2; } - obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], - set->objtype, genmask); + obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err2; @@ -4099,7 +4153,8 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; @@ -4287,12 +4342,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -4380,9 +4435,9 @@ void nft_unregister_obj(struct nft_object_type *obj_type) } EXPORT_SYMBOL_GPL(nft_unregister_obj); -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask) { struct nft_object *obj; @@ -4394,11 +4449,11 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +EXPORT_SYMBOL_GPL(nft_obj_lookup); -static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) { struct nft_object *obj; @@ -4542,22 +4597,25 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); + } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); - if (err != -ENOENT) + if (err != -ENOENT) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return err; - + } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; - + } return 0; } @@ -4768,15 +4826,18 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); + } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); - if (IS_ERR(obj)) + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return PTR_ERR(obj); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -4815,6 +4876,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -4824,22 +4886,29 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); + } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - if (nla[NFTA_OBJ_HANDLE]) - obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], - objtype, genmask); - else - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], - objtype, genmask); - if (IS_ERR(obj)) + if (nla[NFTA_OBJ_HANDLE]) { + attr = nla[NFTA_OBJ_HANDLE]; + obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask); + } else { + attr = nla[NFTA_OBJ_NAME]; + obj = nft_obj_lookup(table, attr, objtype, genmask); + } + + if (IS_ERR(obj)) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); - if (obj->use > 0) + } + if (obj->use > 0) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -4910,9 +4979,8 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -4923,11 +4991,11 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); +EXPORT_SYMBOL_GPL(nft_flowtable_lookup); static struct nft_flowtable * -nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +nft_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -5026,7 +5094,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, flowtable->ops[i].pf = NFPROTO_NETDEV; flowtable->ops[i].hooknum = hooknum; flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data.rhashtable; + flowtable->ops[i].priv = &flowtable->data; flowtable->ops[i].hook = flowtable->data.type->hook; flowtable->ops[i].dev = dev_array[i]; flowtable->dev_name[i] = kstrdup(dev_array[i]->name, @@ -5067,23 +5135,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) return ERR_PTR(-ENOENT); } -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data) -{ - struct nft_flowtable *flowtable; - const struct nft_table *table; - - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - iter(&flowtable->data, data); - } - } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_flow_table_iterate); - static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { @@ -5117,20 +5168,26 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); + } - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); - if (err != -ENOENT) + if (err != -ENOENT) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return err; + } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; + } return 0; } @@ -5157,14 +5214,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, } flowtable->data.type = type; - err = rhashtable_init(&flowtable->data.rhashtable, type->params); + err = type->init(&flowtable->data); if (err < 0) goto err3; err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], flowtable); if (err < 0) - goto err3; + goto err4; for (i = 0; i < flowtable->ops_len; i++) { if (!flowtable->ops[i].dev) @@ -5178,37 +5235,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (flowtable->ops[i].dev == ft->ops[k].dev && flowtable->ops[i].pf == ft->ops[k].pf) { err = -EBUSY; - goto err4; + goto err5; } } } err = nf_register_net_hook(net, &flowtable->ops[i]); if (err < 0) - goto err4; + goto err5; } err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; - - INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc); - queue_delayed_work(system_power_efficient_wq, - &flowtable->data.gc_work, HZ); + goto err6; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err5: +err6: i = flowtable->ops_len; -err4: +err5: for (k = i - 1; k >= 0; k--) { kfree(flowtable->dev_name[k]); nf_unregister_net_hook(net, &flowtable->ops[k]); } kfree(flowtable->ops); +err4: + flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); err2: @@ -5228,6 +5283,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -5236,23 +5292,29 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); + } - if (nla[NFTA_FLOWTABLE_HANDLE]) - flowtable = nf_tables_flowtable_lookup_byhandle(table, - nla[NFTA_FLOWTABLE_HANDLE], - genmask); - else - flowtable = nf_tables_flowtable_lookup(table, - nla[NFTA_FLOWTABLE_NAME], - genmask); - if (IS_ERR(flowtable)) - return PTR_ERR(flowtable); - if (flowtable->use > 0) + if (nla[NFTA_FLOWTABLE_HANDLE]) { + attr = nla[NFTA_FLOWTABLE_HANDLE]; + flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); + } else { + attr = nla[NFTA_FLOWTABLE_NAME]; + flowtable = nft_flowtable_lookup(table, attr, genmask); + } + + if (IS_ERR(flowtable)) { + NL_SET_BAD_ATTR(extack, attr); + return PTR_ERR(flowtable); + } + if (flowtable->use > 0) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -5433,13 +5495,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); @@ -5492,11 +5554,9 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - cancel_delayed_work_sync(&flowtable->data.gc_work); kfree(flowtable->ops); kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); - rhashtable_destroy(&flowtable->data.rhashtable); module_put(flowtable->data.type->owner); } @@ -6410,8 +6470,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index dfd0bf3810d2..9cf47c4cb9d5 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -251,6 +251,9 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_payload_type, &nft_dynset_type, &nft_range_type, + &nft_meta_type, + &nft_rt_type, + &nft_exthdr_type, }; int __init nf_tables_core_module_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7b46aa4c478d..e5cc4d9b9ce7 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -37,7 +37,6 @@ #include <net/sock.h> #include <net/netfilter/nf_log.h> #include <net/netns/generic.h> -#include <net/netfilter/nfnetlink_log.h> #include <linux/atomic.h> #include <linux/refcount.h> @@ -47,6 +46,7 @@ #include "../bridge/br_private.h" #endif +#define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ @@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = { }; /* log handler for internal netfilter logging api */ -void +static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, @@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net, struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; - unsigned int plen; + unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; struct nf_conn *ct = NULL; @@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net, if (!inst) return; - plen = 0; if (prefix) plen = strlen(prefix) + 1; @@ -760,7 +759,6 @@ alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } -EXPORT_SYMBOL_GPL(nfulnl_log_packet); static int nfulnl_rcv_nl_event(struct notifier_block *this, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 04863fad05dd..b07a3fd9eeea 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -36,7 +36,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, u64 timeout; void *elem; - if (set->size && !atomic_add_unless(&set->nelems, 1, set->size)) + if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; timeout = priv->timeout ? : set->timeout; @@ -81,7 +81,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; - *nft_set_ext_expiration(ext) = jiffies + timeout; + *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; } if (sexpr != NULL) @@ -216,6 +216,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (set->size == 0) + set->size = 0xffff; + priv->set = set; return 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 47ec1046ad11..a940c9fd9045 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,11 +10,10 @@ #include <asm/unaligned.h> #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/tcp.h> @@ -353,7 +352,6 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static struct nft_expr_type nft_exthdr_type; static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -407,27 +405,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EOPNOTSUPP); } -static struct nft_expr_type nft_exthdr_type __read_mostly = { +struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", .select_ops = nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, }; - -static int __init nft_exthdr_module_init(void) -{ - return nft_register_expr(&nft_exthdr_type); -} - -static void __exit nft_exthdr_module_exit(void) -{ - nft_unregister_expr(&nft_exthdr_type); -} - -module_init(nft_exthdr_module_init); -module_exit(nft_exthdr_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("exthdr"); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b65829b2be22..d6bab8c3cbb0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nf_tables_flowtable_lookup(ctx->table, - tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 24f2f7567ddb..e235c17f1b8b 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -97,7 +97,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); - if (priv->modulus <= 1) + if (priv->modulus < 1) return -ERANGE; if (priv->offset + priv->modulus - 1 < priv->offset) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8fb91940e2e7..5348bd058c88 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2014 Intel Corporation + * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,8 +11,6 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -24,21 +24,35 @@ #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> -#include <net/netfilter/nft_meta.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +struct nft_meta { + enum nft_meta_keys key:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +#ifdef CONFIG_NF_TABLES_BRIDGE +#include "../bridge/br_private.h" +#endif + +static void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; +#ifdef CONFIG_NF_TABLES_BRIDGE + const struct net_bridge_port *p; +#endif switch (priv->key) { case NFT_META_LEN: @@ -215,6 +229,18 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store8(dest, !!skb->sp); break; #endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; +#endif default: WARN_ON(1); goto err; @@ -224,11 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } -EXPORT_SYMBOL_GPL(nft_meta_get_eval); -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -258,18 +283,16 @@ void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_meta_set_eval); -const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; -EXPORT_SYMBOL_GPL(nft_meta_policy); -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -318,6 +341,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + if (ctx->family != NFPROTO_BRIDGE) + return -EOPNOTSUPP; + len = IFNAMSIZ; + break; +#endif default: return -EOPNOTSUPP; } @@ -326,7 +357,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } -EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -360,9 +390,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -388,11 +418,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } -EXPORT_SYMBOL_GPL(nft_meta_set_validate); -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -424,10 +453,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } -EXPORT_SYMBOL_GPL(nft_meta_set_init); -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -440,10 +468,8 @@ int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -457,19 +483,16 @@ int nft_meta_set_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_set_dump); -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } -EXPORT_SYMBOL_GPL(nft_meta_set_destroy); -static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -508,27 +531,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); } -static struct nft_expr_type nft_meta_type __read_mostly = { +struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; - -static int __init nft_meta_module_init(void) -{ - return nft_register_expr(&nft_meta_type); -} - -static void __exit nft_meta_module_exit(void) -{ - nft_unregister_expr(&nft_meta_type); -} - -module_init(nft_meta_module_init); -module_exit(nft_meta_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 1f36954c2ba9..c15807d10b91 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr, const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 5a3a52c71545..8a64db8f2e69 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -24,13 +24,11 @@ struct nft_ng_inc { u32 modulus; atomic_t counter; u32 offset; + struct nft_set *map; }; -static void nft_ng_inc_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) { - struct nft_ng_inc *priv = nft_expr_priv(expr); u32 nval, oval; do { @@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); - regs->data[priv->dreg] = nval + priv->offset; + return nval + priv->offset; +} + +static void nft_ng_inc_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + + regs->data[priv->dreg] = nft_ng_inc_gen(priv); +} + +static void nft_ng_inc_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + const struct nft_set *map = priv->map; + const struct nft_set_ext *ext; + u32 result; + bool found; + + result = nft_ng_inc_gen(priv); + found = map->ops->lookup(nft_net(pkt), map, &result, &ext); + + if (!found) + return; + + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), map->dlen); } static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { @@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, + [NFTA_NG_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, + [NFTA_NG_SET_ID] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -71,6 +101,25 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } +static int nft_ng_inc_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + + nft_ng_inc_init(ctx, expr, tb); + + priv->map = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_NG_SET_NAME], + tb[NFTA_NG_SET_ID], genmask); + + if (IS_ERR(priv->map)) + return PTR_ERR(priv->map); + + return 0; +} + static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -97,6 +146,22 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } +static int nft_ng_inc_map_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + if (nft_ng_dump(skb, priv->dreg, priv->modulus, + NFT_NG_INCREMENTAL, priv->offset) || + nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; @@ -156,6 +221,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .dump = nft_ng_inc_dump, }; +static const struct nft_expr_ops nft_ng_inc_map_ops = { + .type = &nft_ng_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), + .eval = nft_ng_inc_map_eval, + .init = nft_ng_inc_map_init, + .dump = nft_ng_inc_map_dump, +}; + static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), @@ -178,6 +251,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) switch (type) { case NFT_NG_INCREMENTAL: + if (tb[NFTA_NG_SET_NAME]) + return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: return &nft_ng_random_ops; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 0b02407773ad..cdf348f751ec 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); - obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, - genmask); + obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + genmask); if (IS_ERR(obj)) return -ENOENT; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 11a2071b6dd4..76dba9f6b6f6 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -7,8 +7,6 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -179,7 +177,6 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp return nft_chain_validate_hooks(ctx->chain, hooks); } -static struct nft_expr_type nft_rt_type; static const struct nft_expr_ops nft_rt_get_ops = { .type = &nft_rt_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)), @@ -189,27 +186,10 @@ static const struct nft_expr_ops nft_rt_get_ops = { .validate = nft_rt_validate, }; -static struct nft_expr_type nft_rt_type __read_mostly = { +struct nft_expr_type nft_rt_type __read_mostly = { .name = "rt", .ops = &nft_rt_get_ops, .policy = nft_rt_policy, .maxattr = NFTA_RT_MAX, .owner = THIS_MODULE, }; - -static int __init nft_rt_module_init(void) -{ - return nft_register_expr(&nft_rt_type); -} - -static void __exit nft_rt_module_exit(void) -{ - nft_unregister_expr(&nft_rt_type); -} - -module_init(nft_rt_module_init); -module_exit(nft_rt_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>"); -MODULE_ALIAS_NFT_EXPR("rt"); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 45fb2752fb63..d6626e01c7ee 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type; -static struct nft_set_ops nft_bitmap_ops __read_mostly = { - .type = &nft_bitmap_type, - .privsize = nft_bitmap_privsize, - .elemsize = offsetof(struct nft_bitmap_elem, ext), - .estimate = nft_bitmap_estimate, - .init = nft_bitmap_init, - .destroy = nft_bitmap_destroy, - .insert = nft_bitmap_insert, - .remove = nft_bitmap_remove, - .deactivate = nft_bitmap_deactivate, - .flush = nft_bitmap_flush, - .activate = nft_bitmap_activate, - .lookup = nft_bitmap_lookup, - .walk = nft_bitmap_walk, - .get = nft_bitmap_get, -}; - static struct nft_set_type nft_bitmap_type __read_mostly = { - .ops = &nft_bitmap_ops, .owner = THIS_MODULE, + .ops = { + .privsize = nft_bitmap_privsize, + .elemsize = offsetof(struct nft_bitmap_elem, ext), + .estimate = nft_bitmap_estimate, + .init = nft_bitmap_init, + .destroy = nft_bitmap_destroy, + .insert = nft_bitmap_insert, + .remove = nft_bitmap_remove, + .deactivate = nft_bitmap_deactivate, + .flush = nft_bitmap_flush, + .activate = nft_bitmap_activate, + .lookup = nft_bitmap_lookup, + .walk = nft_bitmap_walk, + .get = nft_bitmap_get, + }, }; static int __init nft_bitmap_module_init(void) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index fc9c6d5d64cd..dbf1f4ad077c 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -605,6 +605,12 @@ static void nft_hash_destroy(const struct nft_set *set) static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (!desc->size) + return false; + + if (desc->klen == 4) + return false; + est->size = sizeof(struct nft_hash) + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + desc->size * sizeof(struct nft_hash_elem); @@ -614,91 +620,100 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_hash_type; -static struct nft_set_ops nft_rhash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_rhash_privsize, - .elemsize = offsetof(struct nft_rhash_elem, ext), - .estimate = nft_rhash_estimate, - .init = nft_rhash_init, - .destroy = nft_rhash_destroy, - .insert = nft_rhash_insert, - .activate = nft_rhash_activate, - .deactivate = nft_rhash_deactivate, - .flush = nft_rhash_flush, - .remove = nft_rhash_remove, - .lookup = nft_rhash_lookup, - .update = nft_rhash_update, - .walk = nft_rhash_walk, - .get = nft_rhash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, -}; +static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + if (!desc->size) + return false; -static struct nft_set_ops nft_hash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; + if (desc->klen != 4) + return false; -static struct nft_set_ops nft_hash_fast_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup_fast, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; - -static const struct nft_set_ops * -nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, - u32 flags) -{ - if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) { - switch (desc->klen) { - case 4: - return &nft_hash_fast_ops; - default: - return &nft_hash_ops; - } - } + est->size = sizeof(struct nft_hash) + + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + desc->size * sizeof(struct nft_hash_elem); + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; - return &nft_rhash_ops; + return true; } +static struct nft_set_type nft_rhash_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT | NFT_SET_EVAL, + .ops = { + .privsize = nft_rhash_privsize, + .elemsize = offsetof(struct nft_rhash_elem, ext), + .estimate = nft_rhash_estimate, + .init = nft_rhash_init, + .destroy = nft_rhash_destroy, + .insert = nft_rhash_insert, + .activate = nft_rhash_activate, + .deactivate = nft_rhash_deactivate, + .flush = nft_rhash_flush, + .remove = nft_rhash_remove, + .lookup = nft_rhash_lookup, + .update = nft_rhash_update, + .walk = nft_rhash_walk, + .get = nft_rhash_get, + }, +}; + static struct nft_set_type nft_hash_type __read_mostly = { - .select_ops = nft_hash_select_ops, .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, +}; + +static struct nft_set_type nft_hash_fast_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_fast_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup_fast, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_hash_type); + if (nft_register_set(&nft_hash_fast_type) || + nft_register_set(&nft_hash_type) || + nft_register_set(&nft_rhash_type)) + return 1; + return 0; } static void __exit nft_hash_module_exit(void) { + nft_unregister_set(&nft_rhash_type); nft_unregister_set(&nft_hash_type); + nft_unregister_set(&nft_hash_fast_type); } module_init(nft_hash_module_init); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e6f08bc5f359..22c57d7612c4 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -393,28 +393,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_rbtree_type; -static struct nft_set_ops nft_rbtree_ops __read_mostly = { - .type = &nft_rbtree_type, - .privsize = nft_rbtree_privsize, - .elemsize = offsetof(struct nft_rbtree_elem, ext), - .estimate = nft_rbtree_estimate, - .init = nft_rbtree_init, - .destroy = nft_rbtree_destroy, - .insert = nft_rbtree_insert, - .remove = nft_rbtree_remove, - .deactivate = nft_rbtree_deactivate, - .flush = nft_rbtree_flush, - .activate = nft_rbtree_activate, - .lookup = nft_rbtree_lookup, - .walk = nft_rbtree_walk, - .get = nft_rbtree_get, - .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, -}; - static struct nft_set_type nft_rbtree_type __read_mostly = { - .ops = &nft_rbtree_ops, .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_rbtree_privsize, + .elemsize = offsetof(struct nft_rbtree_elem, ext), + .estimate = nft_rbtree_estimate, + .init = nft_rbtree_init, + .destroy = nft_rbtree_destroy, + .insert = nft_rbtree_insert, + .remove = nft_rbtree_remove, + .deactivate = nft_rbtree_deactivate, + .flush = nft_rbtree_flush, + .activate = nft_rbtree_activate, + .lookup = nft_rbtree_lookup, + .walk = nft_rbtree_walk, + .get = nft_rbtree_get, + }, }; static int __init nft_rbtree_module_init(void) diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c index 58aa9dd3c5b7..1d437875e15a 100644 --- a/net/netfilter/xt_NETMAP.c +++ b/net/netfilter/xt_NETMAP.c @@ -21,8 +21,8 @@ static unsigned int netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - struct nf_nat_range newrange; + const struct nf_nat_range2 *range = par->targinfo; + struct nf_nat_range2 newrange; struct nf_conn *ct; enum ip_conntrack_info ctinfo; union nf_inet_addr new_addr, netmask; @@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; @@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && xt_hooknum(par) != NF_INET_POST_ROUTING && diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index c7f8958cea4a..1ed0cac585c4 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -13,7 +13,6 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFLOG.h> #include <net/netfilter/nf_log.h> -#include <net/netfilter/nfnetlink_log.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); @@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) if (info->flags & XT_NFLOG_F_COPY_LEN) li.u.ulog.flags |= NF_LOG_F_COPY_LEN; - nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb, - xt_in(par), xt_out(par), &li, info->prefix); + nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), + xt_out(par), &li, "%s", info->prefix); + return XT_CONTINUE; } @@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return 0; + + return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); +} + +static void nflog_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } static struct xt_target nflog_tg_reg __read_mostly = { @@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = { .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 98a4c6d4f1cb..5ce9461e979c 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index bdb689cdc829..8af9707f8789 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static void xt_nat_convert_range(struct nf_nat_range *dst, +static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; @@ -54,7 +55,7 @@ static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -71,7 +72,7 @@ static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); - return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); + + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, + { + .name = "SNAT", + .revision = 2, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, + .target = xt_snat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 2, + .target = xt_dnat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, }; static int __init xt_nat_init(void) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index a34f314a8c23..9cfef73b4107 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -37,21 +37,6 @@ #include <net/netfilter/nf_log.h> #include <linux/netfilter/xt_osf.h> -struct xt_osf_finger { - struct rcu_head rcu_head; - struct list_head finger_entry; - struct xt_osf_user_finger finger; -}; - -enum osf_fmatch_states { - /* Packet does not match the fingerprint */ - FMATCH_WRONG = 0, - /* Packet matches the fingerprint */ - FMATCH_OK, - /* Options do not match the fingerprint, but header does */ - FMATCH_OPT_WRONG, -}; - /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. @@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = { .cb = xt_osf_nfnetlink_callbacks, }; -static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, - unsigned char f_ttl) -{ - const struct iphdr *ip = ip_hdr(skb); - - if (info->flags & XT_OSF_TTL) { - if (info->ttl == XT_OSF_TTL_TRUE) - return ip->ttl == f_ttl; - if (info->ttl == XT_OSF_TTL_NOCHECK) - return 1; - else if (ip->ttl <= f_ttl) - return 1; - else { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); - int ret = 0; - - for_ifa(in_dev) { - if (inet_ifa_match(ip->saddr, ifa)) { - ret = (ip->ttl == f_ttl); - break; - } - } - endfor_ifa(in_dev); - - return ret; - } - } - - return ip->ttl == f_ttl; -} - static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { const struct xt_osf_info *info = p->matchinfo; - const struct iphdr *ip = ip_hdr(skb); - const struct tcphdr *tcp; - struct tcphdr _tcph; - int fmatch = FMATCH_WRONG, fcount = 0; - unsigned int optsize = 0, check_WSS = 0; - u16 window, totlen, mss = 0; - bool df; - const unsigned char *optp = NULL, *_optp = NULL; - unsigned char opts[MAX_IPOPTLEN]; - const struct xt_osf_finger *kf; - const struct xt_osf_user_finger *f; struct net *net = xt_net(p); if (!info) return false; - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); - if (!tcp) - return false; - - if (!tcp->syn) - return false; - - totlen = ntohs(ip->tot_len); - df = ntohs(ip->frag_off) & IP_DF; - window = ntohs(tcp->window); - - if (tcp->doff * 4 > sizeof(struct tcphdr)) { - optsize = tcp->doff * 4 - sizeof(struct tcphdr); - - _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + - sizeof(struct tcphdr), optsize, opts); - } - - list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { - int foptsize, optnum; - - f = &kf->finger; - - if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) - continue; - - optp = _optp; - fmatch = FMATCH_WRONG; - - if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl)) - continue; - - /* - * Should not happen if userspace parser was written correctly. - */ - if (f->wss.wc >= OSF_WSS_MAX) - continue; - - /* Check options */ - - foptsize = 0; - for (optnum = 0; optnum < f->opt_num; ++optnum) - foptsize += f->opt[optnum].length; - - if (foptsize > MAX_IPOPTLEN || - optsize > MAX_IPOPTLEN || - optsize != foptsize) - continue; - - check_WSS = f->wss.wc; - - for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == (*optp)) { - __u32 len = f->opt[optnum].length; - const __u8 *optend = optp + len; - - fmatch = FMATCH_OK; - - switch (*optp) { - case OSFOPT_MSS: - mss = optp[3]; - mss <<= 8; - mss |= optp[2]; - - mss = ntohs((__force __be16)mss); - break; - case OSFOPT_TS: - break; - } - - optp = optend; - } else - fmatch = FMATCH_OPT_WRONG; - - if (fmatch != FMATCH_OK) - break; - } - - if (fmatch != FMATCH_OPT_WRONG) { - fmatch = FMATCH_WRONG; - - switch (check_WSS) { - case OSF_WSS_PLAIN: - if (f->wss.val == 0 || window == f->wss.val) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MSS: - /* - * Some smart modems decrease mangle MSS to - * SMART_MSS_2, so we check standard, decreased - * and the one provided in the fingerprint MSS - * values. - */ -#define SMART_MSS_1 1460 -#define SMART_MSS_2 1448 - if (window == f->wss.val * mss || - window == f->wss.val * SMART_MSS_1 || - window == f->wss.val * SMART_MSS_2) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MTU: - if (window == f->wss.val * (mss + 40) || - window == f->wss.val * (SMART_MSS_1 + 40) || - window == f->wss.val * (SMART_MSS_2 + 40)) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MODULO: - if ((window % f->wss.val) == 0) - fmatch = FMATCH_OK; - break; - } - } - - if (fmatch != FMATCH_OK) - continue; - - fcount++; - - if (info->flags & XT_OSF_LOG) - nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, - xt_in(p), xt_out(p), NULL, - "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", - f->genre, f->version, f->subtype, - &ip->saddr, ntohs(tcp->source), - &ip->daddr, ntohs(tcp->dest), - f->ttl - ip->ttl); - - if ((info->flags & XT_OSF_LOG) && - info->loglevel == XT_OSF_LOGLEVEL_FIRST) - break; - } - - if (!fcount && (info->flags & XT_OSF_LOG)) - nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p), - xt_out(p), NULL, - "Remote OS is not known: %pI4:%u -> %pI4:%u\n", - &ip->saddr, ntohs(tcp->source), - &ip->daddr, ntohs(tcp->dest)); - - if (fcount) - fmatch = FMATCH_OK; - - return fmatch == FMATCH_OK; + return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), + xt_out(p), info, net, xt_osf_fingers); } static struct xt_match xt_osf_match = { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c5904f629091..02fc343feb66 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -72,7 +72,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; #ifdef CONFIG_NF_NAT_NEEDED - struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ + struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ #endif }; @@ -710,7 +710,7 @@ static bool skb_nfct_cached(struct net *net, */ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { int hooknum, nh_off, err = NF_ACCEPT; |