summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/tproxy.rst2
-rw-r--r--include/linux/netfilter.h4
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c10
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c5
-rw-r--r--net/netfilter/nf_conntrack_core.c141
-rw-r--r--net/netfilter/nf_conntrack_netlink.c9
-rw-r--r--net/netfilter/nf_nat_core.c121
-rw-r--r--net/netfilter/nf_tables_api.c6
-rw-r--r--net/netfilter/nft_compat.c6
-rw-r--r--net/netfilter/nft_log.c2
-rw-r--r--net/netfilter/nft_meta.c2
-rw-r--r--net/netfilter/nft_numgen.c2
-rw-r--r--net/netfilter/nft_set_pipapo.c13
-rw-r--r--net/netfilter/nft_tunnel.c5
-rw-r--r--tools/testing/selftests/net/netfilter/Makefile4
-rw-r--r--tools/testing/selftests/net/netfilter/config1
-rw-r--r--tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c125
-rwxr-xr-xtools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh51
-rwxr-xr-xtools/testing/selftests/net/netfilter/ipvs.sh2
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_queue.sh92
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh358
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_tproxy_udp.sh262
22 files changed, 1091 insertions, 132 deletions
diff --git a/Documentation/networking/tproxy.rst b/Documentation/networking/tproxy.rst
index 00dc3a1a66b4..7f7c1ff6f159 100644
--- a/Documentation/networking/tproxy.rst
+++ b/Documentation/networking/tproxy.rst
@@ -17,7 +17,7 @@ The idea is that you identify packets with destination address matching a local
socket on your box, set the packet mark to a certain value::
# iptables -t mangle -N DIVERT
- # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
+ # iptables -t mangle -A PREROUTING -p tcp -m socket --transparent -j DIVERT
# iptables -t mangle -A DIVERT -j MARK --set-mark 1
# iptables -t mangle -A DIVERT -j ACCEPT
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2683b2b77612..2b8aac2c70ad 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -376,15 +376,11 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
struct nf_conn;
enum nf_nat_manip_type;
struct nlattr;
-enum ip_conntrack_dir;
struct nf_nat_hook {
int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
const struct nlattr *attr);
void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
- unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
- enum nf_nat_manip_type mtype,
- enum ip_conntrack_dir dir);
void (*remove_nat_bysrc)(struct nf_conn *ct);
};
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 04504b2b51df..87fd945a0d27 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -239,9 +239,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
int hook)
{
- struct sk_buff *nskb;
- struct iphdr *niph;
const struct tcphdr *oth;
+ struct sk_buff *nskb;
struct tcphdr _oth;
oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
@@ -266,14 +265,12 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
skb_reserve(nskb, LL_MAX_HEADER);
- niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
- ip4_dst_hoplimit(skb_dst(nskb)));
+ nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
goto free_nskb;
- niph = ip_hdr(nskb);
-
/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
goto free_nskb;
@@ -290,6 +287,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
*/
if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
+ struct iphdr *niph = ip_hdr(nskb);
struct net_device *br_indev;
br_indev = nf_bridge_get_physindev(oldskb, net);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index b9457473c176..7db0437140bf 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -273,7 +273,6 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
const struct tcphdr *otcph;
unsigned int otcplen, hh_len;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
- struct ipv6hdr *ip6h;
struct dst_entry *dst = NULL;
struct flowi6 fl6;
@@ -329,8 +328,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
nskb->mark = fl6.flowi6_mark;
skb_reserve(nskb, hh_len + dst->header_len);
- ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
- ip6_dst_hoplimit(dst));
+ nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst));
nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
nf_ct_attach(nskb, oldskb);
@@ -345,6 +343,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
*/
if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
+ struct ipv6hdr *ip6h = ipv6_hdr(nskb);
struct net_device *br_indev;
br_indev = nf_bridge_get_physindev(oldskb, net);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d3cb53b008f5..9db3e2b0b1c3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -988,6 +988,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}
+/**
+ * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
+ * @ct1: conntrack in hash table to check against
+ * @ct2: merge candidate
+ *
+ * returns true if ct1 and ct2 happen to refer to the same flow, but
+ * in opposing directions, i.e.
+ * ct1: a:b -> c:d
+ * ct2: c:d -> a:b
+ * for both directions. If so, @ct2 should not have been created
+ * as the skb should have been picked up as ESTABLISHED flow.
+ * But ct1 was not yet committed to hash table before skb that created
+ * ct2 had arrived.
+ *
+ * Note we don't compare netns because ct entries in different net
+ * namespace cannot clash to begin with.
+ *
+ * @return: true if ct1 and ct2 are identical when swapping origin/reply.
+ */
+static bool
+nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+ u16 id1, id2;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
+ return false;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
+ if (id1 != id2)
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);
+
+ return id1 == id2;
+}
+
+static int nf_ct_can_merge(const struct nf_conn *ct,
+ const struct nf_conn *loser_ct)
+{
+ return nf_ct_match(ct, loser_ct) ||
+ nf_ct_match_reverse(ct, loser_ct);
+}
+
/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
@@ -999,11 +1049,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
loser_ct = nf_ct_get(skb, &ctinfo);
- if (nf_ct_is_dying(ct))
- return NF_DROP;
-
- if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
- nf_ct_match(ct, loser_ct)) {
+ if (nf_ct_can_merge(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);
nf_conntrack_get(&ct->ct_general);
@@ -2151,80 +2197,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
-static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- const struct nf_nat_hook *nat_hook;
- struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
- unsigned int status;
- int dataoff;
- u16 l3num;
- u8 l4num;
-
- l3num = nf_ct_l3num(ct);
-
- dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
- if (dataoff <= 0)
- return NF_DROP;
-
- if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- l4num, net, &tuple))
- return NF_DROP;
-
- if (ct->status & IPS_SRC_NAT) {
- memcpy(tuple.src.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
- sizeof(tuple.src.u3.all));
- tuple.src.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
- }
-
- if (ct->status & IPS_DST_NAT) {
- memcpy(tuple.dst.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
- sizeof(tuple.dst.u3.all));
- tuple.dst.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
- }
-
- h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
- if (!h)
- return NF_ACCEPT;
-
- /* Store status bits of the conntrack that is clashing to re-do NAT
- * mangling according to what it has been done already to this packet.
- */
- status = ct->status;
-
- nf_ct_put(ct);
- ct = nf_ct_tuplehash_to_ctrack(h);
- nf_ct_set(skb, ct, ctinfo);
-
- nat_hook = rcu_dereference(nf_nat_hook);
- if (!nat_hook)
- return NF_ACCEPT;
-
- if (status & IPS_SRC_NAT) {
- unsigned int verdict = nat_hook->manip_pkt(skb, ct,
- NF_NAT_MANIP_SRC,
- IP_CT_DIR_ORIGINAL);
- if (verdict != NF_ACCEPT)
- return verdict;
- }
-
- if (status & IPS_DST_NAT) {
- unsigned int verdict = nat_hook->manip_pkt(skb, ct,
- NF_NAT_MANIP_DST,
- IP_CT_DIR_ORIGINAL);
- if (verdict != NF_ACCEPT)
- return verdict;
- }
-
- return NF_ACCEPT;
-}
-
/* This packet is coming from userspace via nf_queue, complete the packet
* processing after the helper invocation in nf_confirm().
*/
@@ -2288,17 +2260,6 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
if (!ct)
return NF_ACCEPT;
- if (!nf_ct_is_confirmed(ct)) {
- int ret = __nf_conntrack_update(net, skb, ct, ctinfo);
-
- if (ret != NF_ACCEPT)
- return ret;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return NF_ACCEPT;
- }
-
return nf_confirm_cthelper(skb, ct, ctinfo);
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 123e2e933e9b..6a1239433830 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -382,7 +382,7 @@ nla_put_failure:
#define ctnetlink_dump_secctx(a, b) (0)
#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -391,6 +391,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
return 0;
return nla_total_size(sizeof(labels->bits));
}
+#endif
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
@@ -411,10 +412,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
return 0;
}
-#else
-#define ctnetlink_dump_labels(a, b) (0)
-#define ctnetlink_label_size(a) (0)
-#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -652,7 +649,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
return len + len4;
}
-#endif
static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
@@ -690,6 +686,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
return 0;
#endif
}
+#endif
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 6d8da6dddf99..4085c436e306 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -183,7 +183,35 @@ hash_by_src(const struct net *net,
return reciprocal_scale(hash, nf_nat_htable_size);
}
-/* Is this tuple already taken? (not by us) */
+/**
+ * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_conntrack: our (unconfirmed) conntrack entry
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple in either direction.
+ *
+ * Example:
+ * INITIATOR -> NAT/PAT -> RESPONDER
+ *
+ * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
+ * Then, later, NAT/PAT itself also connects to RESPONDER.
+ *
+ * This will not work if the SNAT done earlier has same IP:PORT source pair.
+ *
+ * Conntrack table has:
+ * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * and new locally originating connection wants:
+ * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * ... which would mean incoming packets cannot be distinguished between
+ * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
+ *
+ * @return: true if the proposed NAT mapping collides with an existing entry.
+ */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
@@ -200,6 +228,94 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
+static bool nf_nat_allow_clash(const struct nf_conn *ct)
+{
+ return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
+}
+
+/**
+ * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_ct: our (unconfirmed) conntrack entry
+ *
+ * Same as nf_nat_used_tuple, but also check for rare clash in reverse
+ * direction. Should be called only when @tuple has not been altered, i.e.
+ * @ignored_conntrack will not be subject to NAT.
+ *
+ * @return: true if the proposed NAT mapping collides with existing entry.
+ */
+static noinline bool
+nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_ct)
+{
+ static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT;
+ const struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ if (!nf_nat_used_tuple(tuple, ignored_ct))
+ return false;
+
+ if (!nf_nat_allow_clash(ignored_ct))
+ return true;
+
+ /* Initial choice clashes with existing conntrack.
+ * Check for (rare) reverse collision.
+ *
+ * This can happen when new packets are received in both directions
+ * at the exact same time on different CPUs.
+ *
+ * Without SMP, first packet creates new conntrack entry and second
+ * packet is resolved as established reply packet.
+ *
+ * With parallel processing, both packets could be picked up as
+ * new and both get their own ct entry allocated.
+ *
+ * If ignored_conntrack and colliding ct are not subject to NAT then
+ * pretend the tuple is available and let later clash resolution
+ * handle this at insertion time.
+ *
+ * Without it, the 'reply' packet has its source port rewritten
+ * by nat engine.
+ */
+ if (READ_ONCE(ignored_ct->status) & uses_nat)
+ return true;
+
+ net = nf_ct_net(ignored_ct);
+ zone = nf_ct_zone(ignored_ct);
+
+ thash = nf_conntrack_find_get(net, zone, tuple);
+ if (unlikely(!thash)) /* clashing entry went away */
+ return false;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ /* NB: IP_CT_DIR_ORIGINAL should be impossible because
+ * nf_nat_used_tuple() handles origin collisions.
+ *
+ * Handle remote chance other CPU confirmed its ct right after.
+ */
+ if (thash->tuple.dst.dir != IP_CT_DIR_REPLY)
+ goto out;
+
+ /* clashing connection subject to NAT? Retry with new tuple. */
+ if (READ_ONCE(ct->status) & uses_nat)
+ goto out;
+
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+ nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+ taken = false;
+ goto out;
+ }
+out:
+ nf_ct_put(ct);
+ return taken;
+}
+
static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
@@ -611,7 +727,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
if (nf_in_range(orig_tuple, range)) {
- if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ if (!nf_nat_used_tuple_new(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
@@ -1208,7 +1324,6 @@ static const struct nf_nat_hook nat_hook = {
#ifdef CONFIG_XFRM
.decode_session = __nf_nat_decode_session,
#endif
- .manip_pkt = nf_nat_manip_pkt,
.remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 57259b5f3ef5..a24fe62650a7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1849,7 +1849,7 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
if (!hook_list)
hook_list = &basechain->hook_list;
- list_for_each_entry(hook, hook_list, list) {
+ list_for_each_entry_rcu(hook, hook_list, list) {
if (!first)
first = hook;
@@ -6684,7 +6684,7 @@ static int nft_setelem_catchall_insert(const struct net *net,
}
}
- catchall = kmalloc(sizeof(*catchall), GFP_KERNEL);
+ catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT);
if (!catchall)
return -ENOMEM;
@@ -9207,7 +9207,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
FLOW_BLOCK_UNBIND);
list_del_rcu(&hook->list);
- kfree(hook);
+ kfree_rcu(hook, rcu);
}
kfree(flowtable->name);
module_put(flowtable->data.type->owner);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 52cdfee17f73..7ca4f0d21fe2 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -535,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
struct xt_match *m = expr->ops->data;
int ret;
- priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL);
+ priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT);
if (!priv->info)
return -ENOMEM;
@@ -808,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
goto err;
}
- ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL);
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err;
@@ -898,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
goto err;
}
- ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL);
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err;
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 5defe6e4fd98..e35588137995 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -163,7 +163,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
- priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+ priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT);
if (priv->prefix == NULL)
return -ENOMEM;
nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 8c8eb14d647b..05cd1e6e6a2f 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -952,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx,
if (tb[NFTA_SECMARK_CTX] == NULL)
return -EINVAL;
- priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
+ priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT);
if (!priv->ctx)
return -ENOMEM;
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 7d29db7c2ac0..bd058babfc82 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -66,7 +66,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL);
+ priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT);
if (!priv->counter)
return -ENOMEM;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index eb4c4a4ac7ac..7be342b495f5 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -663,7 +663,7 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f,
check_add_overflow(rules, extra, &rules_alloc))
return -EOVERFLOW;
- new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL);
+ new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT);
if (!new_mt)
return -ENOMEM;
@@ -936,7 +936,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
return;
}
- new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return;
@@ -1212,7 +1212,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
scratch = kzalloc_node(struct_size(scratch, map,
bsize_max * 2) +
NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL, cpu_to_node(i));
+ GFP_KERNEL_ACCOUNT, cpu_to_node(i));
if (!scratch) {
/* On failure, there's no need to undo previous
* allocations: this means that some scratch maps have
@@ -1427,7 +1427,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
struct nft_pipapo_match *new;
int i;
- new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL);
+ new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT);
if (!new)
return NULL;
@@ -1457,7 +1457,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
src->bsize * sizeof(*dst->lt) +
NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!new_lt)
goto out_lt;
@@ -1470,7 +1470,8 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (src->rules > 0) {
dst->mt = kvmalloc_array(src->rules_alloc,
- sizeof(*src->mt), GFP_KERNEL);
+ sizeof(*src->mt),
+ GFP_KERNEL_ACCOUNT);
if (!dst->mt)
goto out_mt;
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 60a76e6e348e..5c6ed68cc6e0 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -509,13 +509,14 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return err;
}
- md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+ md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL,
+ GFP_KERNEL_ACCOUNT);
if (!md)
return -ENOMEM;
memcpy(&md->u.tun_info, &info, sizeof(info));
#ifdef CONFIG_DST_CACHE
- err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL);
+ err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT);
if (err < 0) {
metadata_dst_free(md);
return err;
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index d13fb5ea3e89..e6c9e777fead 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -13,6 +13,7 @@ TEST_PROGS += conntrack_ipip_mtu.sh
TEST_PROGS += conntrack_tcp_unreplied.sh
TEST_PROGS += conntrack_sctp_collision.sh
TEST_PROGS += conntrack_vrf.sh
+TEST_PROGS += conntrack_reverse_clash.sh
TEST_PROGS += ipvs.sh
TEST_PROGS += nf_conntrack_packetdrill.sh
TEST_PROGS += nf_nat_edemux.sh
@@ -26,6 +27,8 @@ TEST_PROGS += nft_nat.sh
TEST_PROGS += nft_nat_zones.sh
TEST_PROGS += nft_queue.sh
TEST_PROGS += nft_synproxy.sh
+TEST_PROGS += nft_tproxy_tcp.sh
+TEST_PROGS += nft_tproxy_udp.sh
TEST_PROGS += nft_zones_many.sh
TEST_PROGS += rpath.sh
TEST_PROGS += xt_string.sh
@@ -36,6 +39,7 @@ TEST_GEN_PROGS = conntrack_dump_flush
TEST_GEN_FILES = audit_logread
TEST_GEN_FILES += connect_close nf_queue
+TEST_GEN_FILES += conntrack_reverse_clash
TEST_GEN_FILES += sctp_collision
include ../../lib.mk
diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config
index b2dd4db45215..c5fe7b34eaf1 100644
--- a/tools/testing/selftests/net/netfilter/config
+++ b/tools/testing/selftests/net/netfilter/config
@@ -81,6 +81,7 @@ CONFIG_NFT_QUEUE=m
CONFIG_NFT_QUOTA=m
CONFIG_NFT_REDIR=m
CONFIG_NFT_SYNPROXY=m
+CONFIG_NFT_TPROXY=m
CONFIG_VETH=m
CONFIG_VLAN_8021Q=m
CONFIG_XFRM_USER=m
diff --git a/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c
new file mode 100644
index 000000000000..507930cee8cb
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Needs something like:
+ *
+ * iptables -t nat -A POSTROUTING -o nomatch -j MASQUERADE
+ *
+ * so NAT engine attaches a NAT null-binding to each connection.
+ *
+ * With unmodified kernels, child or parent will exit with
+ * "Port number changed" error, even though no port translation
+ * was requested.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+
+#define LEN 512
+#define PORT 56789
+#define TEST_TIME 5
+
+static void die(const char *e)
+{
+ perror(e);
+ exit(111);
+}
+
+static void die_port(uint16_t got, uint16_t want)
+{
+ fprintf(stderr, "Port number changed, wanted %d got %d\n", want, ntohs(got));
+ exit(1);
+}
+
+static int udp_socket(void)
+{
+ static const struct timeval tv = {
+ .tv_sec = 1,
+ };
+ int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+
+ if (fd < 0)
+ die("socket");
+
+ setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
+ return fd;
+}
+
+int main(int argc, char *argv[])
+{
+ struct sockaddr_in sa1 = {
+ .sin_family = AF_INET,
+ };
+ struct sockaddr_in sa2 = {
+ .sin_family = AF_INET,
+ };
+ int s1, s2, status;
+ time_t end, now;
+ socklen_t plen;
+ char buf[LEN];
+ bool child;
+
+ sa1.sin_port = htons(PORT);
+ sa2.sin_port = htons(PORT + 1);
+
+ s1 = udp_socket();
+ s2 = udp_socket();
+
+ inet_pton(AF_INET, "127.0.0.11", &sa1.sin_addr);
+ inet_pton(AF_INET, "127.0.0.12", &sa2.sin_addr);
+
+ if (bind(s1, (struct sockaddr *)&sa1, sizeof(sa1)) < 0)
+ die("bind 1");
+ if (bind(s2, (struct sockaddr *)&sa2, sizeof(sa2)) < 0)
+ die("bind 2");
+
+ child = fork() == 0;
+
+ now = time(NULL);
+ end = now + TEST_TIME;
+
+ while (now < end) {
+ struct sockaddr_in peer;
+ socklen_t plen = sizeof(peer);
+
+ now = time(NULL);
+
+ if (child) {
+ if (sendto(s1, buf, LEN, 0, (struct sockaddr *)&sa2, sizeof(sa2)) != LEN)
+ continue;
+
+ if (recvfrom(s2, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0)
+ die("child recvfrom");
+
+ if (peer.sin_port != htons(PORT))
+ die_port(peer.sin_port, PORT);
+ } else {
+ if (sendto(s2, buf, LEN, 0, (struct sockaddr *)&sa1, sizeof(sa1)) != LEN)
+ continue;
+
+ if (recvfrom(s1, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0)
+ die("parent recvfrom");
+
+ if (peer.sin_port != htons((PORT + 1)))
+ die_port(peer.sin_port, PORT + 1);
+ }
+ }
+
+ if (child)
+ return 0;
+
+ wait(&status);
+
+ if (WIFEXITED(status))
+ return WEXITSTATUS(status);
+
+ return 1;
+}
diff --git a/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh
new file mode 100755
index 000000000000..a24c896347a8
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+cleanup()
+{
+ cleanup_all_ns
+}
+
+checktool "nft --version" "run test without nft"
+checktool "conntrack --version" "run test without conntrack"
+
+trap cleanup EXIT
+
+setup_ns ns0
+
+# make loopback connections get nat null bindings assigned
+ip netns exec "$ns0" nft -f - <<EOF
+table ip nat {
+ chain POSTROUTING {
+ type nat hook postrouting priority srcnat; policy accept;
+ oifname "nomatch" counter packets 0 bytes 0 masquerade
+ }
+}
+EOF
+
+do_flush()
+{
+ local end
+ local now
+
+ now=$(date +%s)
+ end=$((now + 5))
+
+ while [ $now -lt $end ];do
+ ip netns exec "$ns0" conntrack -F 2>/dev/null
+ now=$(date +%s)
+ done
+}
+
+do_flush &
+
+if ip netns exec "$ns0" ./conntrack_reverse_clash; then
+ echo "PASS: No SNAT performed for null bindings"
+else
+ echo "ERROR: SNAT performed without any matching snat rule"
+ exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/net/netfilter/ipvs.sh b/tools/testing/selftests/net/netfilter/ipvs.sh
index 4ceee9fb3949..d3edb16cd4b3 100755
--- a/tools/testing/selftests/net/netfilter/ipvs.sh
+++ b/tools/testing/selftests/net/netfilter/ipvs.sh
@@ -97,7 +97,7 @@ cleanup() {
}
server_listen() {
- ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" &
+ ip netns exec "$ns2" timeout 5 socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" &
server_pid=$!
sleep 0.2
}
diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh
index d66e3c4dfec6..a9d109fcc15c 100755
--- a/tools/testing/selftests/net/netfilter/nft_queue.sh
+++ b/tools/testing/selftests/net/netfilter/nft_queue.sh
@@ -31,7 +31,7 @@ modprobe -q sctp
trap cleanup EXIT
-setup_ns ns1 ns2 nsrouter
+setup_ns ns1 ns2 ns3 nsrouter
TMPFILE0=$(mktemp)
TMPFILE1=$(mktemp)
@@ -48,6 +48,7 @@ if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" >
exit $ksft_skip
fi
ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
ip -net "$nsrouter" link set veth0 up
ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
@@ -57,8 +58,13 @@ ip -net "$nsrouter" link set veth1 up
ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
ip -net "$ns1" link set eth0 up
ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
@@ -70,6 +76,11 @@ ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
ip -net "$ns2" route add default via 10.0.2.1
ip -net "$ns2" route add default via dead:2::1
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
load_ruleset() {
local name=$1
local prio=$2
@@ -473,6 +484,83 @@ EOF
check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output"
}
+udp_listener_ready()
+{
+ ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345
+}
+
+output_files_written()
+{
+ test -s "$1" && test -s "$2"
+}
+
+test_udp_ct_race()
+{
+ ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet udpq {
+ chain prerouting {
+ type nat hook prerouting priority dstnat - 5; policy accept;
+ ip daddr 10.6.6.6 udp dport 12345 counter dnat to numgen inc mod 2 map { 0 : 10.0.2.99, 1 : 10.0.3.99 }
+ }
+ chain postrouting {
+ type filter hook postrouting priority srcnat - 5; policy accept;
+ udp dport 12345 counter queue num 12
+ }
+}
+EOF
+ :> "$TMPFILE1"
+ :> "$TMPFILE2"
+
+ timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE1",trunc &
+ local rpid1=$!
+
+ timeout 10 ip netns exec "$ns3" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE2",trunc &
+ local rpid2=$!
+
+ ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
+ local nfqpid=$!
+
+ busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2"
+ busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3"
+ busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12
+
+ # Send two packets, one should end up in ns1, other in ns2.
+ # This is because nfqueue will delay packet for long enough so that
+ # second packet will not find existing conntrack entry.
+ echo "Packet 1" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
+ echo "Packet 2" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221
+
+ busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
+
+ kill "$nfqpid"
+
+ if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then
+ echo "FAIL: Expected One udp conntrack entry"
+ ip netns exec "$nsrouter" conntrack -L -p udp --dport 12345
+ ret=1
+ fi
+
+ if ! ip netns exec "$nsrouter" nft delete table inet udpq; then
+ echo "FAIL: Could not delete udpq table"
+ ret=1
+ return
+ fi
+
+ NUMLINES1=$(wc -l < "$TMPFILE1")
+ NUMLINES2=$(wc -l < "$TMPFILE2")
+
+ if [ "$NUMLINES1" -ne 1 ] || [ "$NUMLINES2" -ne 1 ]; then
+ ret=1
+ echo "FAIL: uneven udp packet distribution: $NUMLINES1 $NUMLINES2"
+ echo -n "$TMPFILE1: ";cat "$TMPFILE1"
+ echo -n "$TMPFILE2: ";cat "$TMPFILE2"
+ return
+ fi
+
+ echo "PASS: both udp receivers got one packet each"
+}
+
test_queue_removal()
{
read tainted_then < /proc/sys/kernel/tainted
@@ -512,6 +600,7 @@ EOF
ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
load_ruleset "filter" 0
@@ -549,6 +638,7 @@ test_tcp_localhost_connectclose
test_tcp_localhost_requeue
test_sctp_forward
test_sctp_output
+test_udp_ct_race
# should be last, adds vrf device in ns1 and changes routes
test_icmp_vrf
diff --git a/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh b/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh
new file mode 100755
index 000000000000..e208fb03eeb7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh
@@ -0,0 +1,358 @@
+#!/bin/bash
+#
+# This tests tproxy on the following scenario:
+#
+# +------------+
+# +-------+ | nsrouter | +-------+
+# |ns1 |.99 .1| |.1 .99| ns2|
+# | eth0|---------------|veth0 veth1|------------------|eth0 |
+# | | 10.0.1.0/24 | | 10.0.2.0/24 | |
+# +-------+ dead:1::/64 | veth2 | dead:2::/64 +-------+
+# +------------+
+# |.1
+# |
+# |
+# | +-------+
+# | .99| ns3|
+# +------------------------|eth0 |
+# 10.0.3.0/24 | |
+# dead:3::/64 +-------+
+#
+# The tproxy implementation acts as an echo server so the client
+# must receive the same message it sent if it has been proxied.
+# If is not proxied the servers return PONG_NS# with the number
+# of the namespace the server is running.
+#
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+timeout=5
+
+cleanup()
+{
+ ip netns pids "$ns1" | xargs kill 2>/dev/null
+ ip netns pids "$ns2" | xargs kill 2>/dev/null
+ ip netns pids "$ns3" | xargs kill 2>/dev/null
+ ip netns pids "$nsrouter" | xargs kill 2>/dev/null
+
+ cleanup_all_ns
+}
+
+checktool "nft --version" "test without nft tool"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+setup_ns ns1 ns2 ns3 nsrouter
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+ echo "SKIP: No virtual ethernet pair device support in kernel"
+ exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+test_ping() {
+ if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+ return 1
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+ return 2
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
+ return 1
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
+ return 2
+ fi
+
+ return 0
+}
+
+test_ping_router() {
+ if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+ return 3
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+ return 4
+ fi
+
+ return 0
+}
+
+
+listener_ready()
+{
+ local ns="$1"
+ local port="$2"
+ local proto="$3"
+ ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
+test_tproxy()
+{
+ local traffic_origin="$1"
+ local ip_proto="$2"
+ local expect_ns1_ns2="$3"
+ local expect_ns1_ns3="$4"
+ local expect_nsrouter_ns2="$5"
+ local expect_nsrouter_ns3="$6"
+
+ # derived variables
+ local testname="test_${ip_proto}_tcp_${traffic_origin}"
+ local socat_ipproto
+ local ns1_ip
+ local ns2_ip
+ local ns3_ip
+ local ns2_target
+ local ns3_target
+ local nftables_subject
+ local ip_command
+
+ # socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
+ case $ip_proto in
+ "ip")
+ socat_ipproto="-4"
+ ns1_ip=10.0.1.99
+ ns2_ip=10.0.2.99
+ ns3_ip=10.0.3.99
+ ns2_target="tcp:$ns2_ip:8080"
+ ns3_target="tcp:$ns3_ip:8080"
+ nftables_subject="ip daddr $ns2_ip tcp dport 8080"
+ ip_command="ip"
+ ;;
+ "ip6")
+ socat_ipproto="-6"
+ ns1_ip=dead:1::99
+ ns2_ip=dead:2::99
+ ns3_ip=dead:3::99
+ ns2_target="tcp:[$ns2_ip]:8080"
+ ns3_target="tcp:[$ns3_ip]:8080"
+ nftables_subject="ip6 daddr $ns2_ip tcp dport 8080"
+ ip_command="ip -6"
+ ;;
+ *)
+ echo "FAIL: unsupported protocol"
+ exit 255
+ ;;
+ esac
+
+ case $traffic_origin in
+ # to capture the local originated traffic we need to mark the outgoing
+ # traffic so the policy based routing rule redirects it and can be processed
+ # in the prerouting chain.
+ "local")
+ nftables_rules="
+flush ruleset
+table inet filter {
+ chain divert {
+ type filter hook prerouting priority 0; policy accept;
+ $nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept
+ }
+ chain output {
+ type route hook output priority 0; policy accept;
+ $nftables_subject meta mark set 1 accept
+ }
+}"
+ ;;
+ "forward")
+ nftables_rules="
+flush ruleset
+table inet filter {
+ chain divert {
+ type filter hook prerouting priority 0; policy accept;
+ $nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept
+ }
+}"
+ ;;
+ *)
+ echo "FAIL: unsupported parameter for traffic origin"
+ exit 255
+ ;;
+ esac
+
+ # shellcheck disable=SC2046 # Intended splitting of ip_command
+ ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100
+ ip netns exec "$nsrouter" $ip_command route add local "${ns2_ip}" dev lo table 100
+ echo "$nftables_rules" | ip netns exec "$nsrouter" nft -f /dev/stdin
+
+ timeout "$timeout" ip netns exec "$nsrouter" socat "$socat_ipproto" tcp-listen:12345,fork,ip-transparent SYSTEM:"cat" 2>/dev/null &
+ local tproxy_pid=$!
+
+ timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
+ local server2_pid=$!
+
+ timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
+ local server3_pid=$!
+
+ busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-t"
+ busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-t"
+ busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-t"
+
+ local result
+ # request from ns1 to ns2 (forwarded traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns2_target")
+ if [ "$result" == "$expect_ns1_ns2" ] ;then
+ echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2"
+ else
+ echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended"
+ ret=1
+ fi
+
+ # request from ns1 to ns3(forwarded traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns3_target")
+ if [ "$result" = "$expect_ns1_ns3" ] ;then
+ echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
+ else
+ echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
+ ret=1
+ fi
+
+ # request from nsrouter to ns2 (localy originated traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns2_target")
+ if [ "$result" == "$expect_nsrouter_ns2" ] ;then
+ echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2"
+ else
+ echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended"
+ ret=1
+ fi
+
+ # request from nsrouter to ns3 (localy originated traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns3_target")
+ if [ "$result" = "$expect_nsrouter_ns3" ] ;then
+ echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3"
+ else
+ echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\" as intended"
+ ret=1
+ fi
+
+ # cleanup
+ kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null
+ # shellcheck disable=SC2046 # Intended splitting of ip_command
+ ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100
+ ip netns exec "$nsrouter" $ip_command route flush table 100
+}
+
+
+test_ipv4_tcp_forward()
+{
+ local traffic_origin="forward"
+ local ip_proto="ip"
+ local expect_ns1_ns2="I_M_PROXIED"
+ local expect_ns1_ns3="PONG_NS3"
+ local expect_nsrouter_ns2="PONG_NS2"
+ local expect_nsrouter_ns3="PONG_NS3"
+
+ test_tproxy "$traffic_origin" \
+ "$ip_proto" \
+ "$expect_ns1_ns2" \
+ "$expect_ns1_ns3" \
+ "$expect_nsrouter_ns2" \
+ "$expect_nsrouter_ns3"
+}
+
+test_ipv4_tcp_local()
+{
+ local traffic_origin="local"
+ local ip_proto="ip"
+ local expect_ns1_ns2="I_M_PROXIED"
+ local expect_ns1_ns3="PONG_NS3"
+ local expect_nsrouter_ns2="I_M_PROXIED"
+ local expect_nsrouter_ns3="PONG_NS3"
+
+ test_tproxy "$traffic_origin" \
+ "$ip_proto" \
+ "$expect_ns1_ns2" \
+ "$expect_ns1_ns3" \
+ "$expect_nsrouter_ns2" \
+ "$expect_nsrouter_ns3"
+}
+
+test_ipv6_tcp_forward()
+{
+ local traffic_origin="forward"
+ local ip_proto="ip6"
+ local expect_ns1_ns2="I_M_PROXIED"
+ local expect_ns1_ns3="PONG_NS3"
+ local expect_nsrouter_ns2="PONG_NS2"
+ local expect_nsrouter_ns3="PONG_NS3"
+
+ test_tproxy "$traffic_origin" \
+ "$ip_proto" \
+ "$expect_ns1_ns2" \
+ "$expect_ns1_ns3" \
+ "$expect_nsrouter_ns2" \
+ "$expect_nsrouter_ns3"
+}
+
+test_ipv6_tcp_local()
+{
+ local traffic_origin="local"
+ local ip_proto="ip6"
+ local expect_ns1_ns2="I_M_PROXIED"
+ local expect_ns1_ns3="PONG_NS3"
+ local expect_nsrouter_ns2="I_M_PROXIED"
+ local expect_nsrouter_ns3="PONG_NS3"
+
+ test_tproxy "$traffic_origin" \
+ "$ip_proto" \
+ "$expect_ns1_ns2" \
+ "$expect_ns1_ns3" \
+ "$expect_nsrouter_ns2" \
+ "$expect_nsrouter_ns3"
+}
+
+if test_ping; then
+ # queue bypass works (rules were skipped, no listener)
+ echo "PASS: ${ns1} can reach ${ns2}"
+else
+ echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+ exit $ret
+fi
+
+test_ipv4_tcp_forward
+test_ipv4_tcp_local
+test_ipv6_tcp_forward
+test_ipv6_tcp_local
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh b/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh
new file mode 100755
index 000000000000..d16de13fe5a7
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+#
+# This tests tproxy on the following scenario:
+#
+# +------------+
+# +-------+ | nsrouter | +-------+
+# |ns1 |.99 .1| |.1 .99| ns2|
+# | eth0|---------------|veth0 veth1|------------------|eth0 |
+# | | 10.0.1.0/24 | | 10.0.2.0/24 | |
+# +-------+ dead:1::/64 | veth2 | dead:2::/64 +-------+
+# +------------+
+# |.1
+# |
+# |
+# | +-------+
+# | .99| ns3|
+# +------------------------|eth0 |
+# 10.0.3.0/24 | |
+# dead:3::/64 +-------+
+#
+# The tproxy implementation acts as an echo server so the client
+# must receive the same message it sent if it has been proxied.
+# If is not proxied the servers return PONG_NS# with the number
+# of the namespace the server is running.
+# shellcheck disable=SC2162,SC2317
+
+source lib.sh
+ret=0
+# UDP is slow
+timeout=15
+
+cleanup()
+{
+ ip netns pids "$ns1" | xargs kill 2>/dev/null
+ ip netns pids "$ns2" | xargs kill 2>/dev/null
+ ip netns pids "$ns3" | xargs kill 2>/dev/null
+ ip netns pids "$nsrouter" | xargs kill 2>/dev/null
+
+ cleanup_all_ns
+}
+
+checktool "nft --version" "test without nft tool"
+checktool "socat -h" "run test without socat"
+
+trap cleanup EXIT
+setup_ns ns1 ns2 ns3 nsrouter
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+ echo "SKIP: No virtual ethernet pair device support in kernel"
+ exit $ksft_skip
+fi
+ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
+ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"
+
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
+ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad
+
+ip -net "$nsrouter" link set veth1 up
+ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
+ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad
+
+ip -net "$nsrouter" link set veth2 up
+ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
+ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad
+
+ip -net "$ns1" link set eth0 up
+ip -net "$ns2" link set eth0 up
+ip -net "$ns3" link set eth0 up
+
+ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
+ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" route add default via 10.0.1.1
+ip -net "$ns1" route add default via dead:1::1
+
+ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
+ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
+ip -net "$ns2" route add default via 10.0.2.1
+ip -net "$ns2" route add default via dead:2::1
+
+ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
+ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
+ip -net "$ns3" route add default via 10.0.3.1
+ip -net "$ns3" route add default via dead:3::1
+
+ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null
+
+test_ping() {
+ if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
+ return 1
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
+ return 2
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
+ return 1
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
+ return 2
+ fi
+
+ return 0
+}
+
+test_ping_router() {
+ if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
+ return 3
+ fi
+
+ if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
+ return 4
+ fi
+
+ return 0
+}
+
+
+listener_ready()
+{
+ local ns="$1"
+ local port="$2"
+ local proto="$3"
+ ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
+}
+
+test_tproxy_udp_forward()
+{
+ local ip_proto="$1"
+
+ local expect_ns1_ns2="I_M_PROXIED"
+ local expect_ns1_ns3="PONG_NS3"
+ local expect_nsrouter_ns2="PONG_NS2"
+ local expect_nsrouter_ns3="PONG_NS3"
+
+ # derived variables
+ local testname="test_${ip_proto}_udp_forward"
+ local socat_ipproto
+ local ns1_ip
+ local ns2_ip
+ local ns3_ip
+ local ns1_ip_port
+ local ns2_ip_port
+ local ns3_ip_port
+ local ip_command
+
+ # socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
+ case $ip_proto in
+ "ip")
+ socat_ipproto="-4"
+ ns1_ip=10.0.1.99
+ ns2_ip=10.0.2.99
+ ns3_ip=10.0.3.99
+ ns1_ip_port="$ns1_ip:18888"
+ ns2_ip_port="$ns2_ip:8080"
+ ns3_ip_port="$ns3_ip:8080"
+ ip_command="ip"
+ ;;
+ "ip6")
+ socat_ipproto="-6"
+ ns1_ip=dead:1::99
+ ns2_ip=dead:2::99
+ ns3_ip=dead:3::99
+ ns1_ip_port="[$ns1_ip]:18888"
+ ns2_ip_port="[$ns2_ip]:8080"
+ ns3_ip_port="[$ns3_ip]:8080"
+ ip_command="ip -6"
+ ;;
+ *)
+ echo "FAIL: unsupported protocol"
+ exit 255
+ ;;
+ esac
+
+ # shellcheck disable=SC2046 # Intended splitting of ip_command
+ ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100
+ ip netns exec "$nsrouter" $ip_command route add local "$ns2_ip" dev lo table 100
+ ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+ chain divert {
+ type filter hook prerouting priority 0; policy accept;
+ $ip_proto daddr $ns2_ip udp dport 8080 tproxy $ip_proto to :12345 meta mark set 1 accept
+ }
+}
+EOF
+
+ timeout "$timeout" ip netns exec "$nsrouter" socat -u "$socat_ipproto" udp-listen:12345,fork,ip-transparent,reuseport udp:"$ns1_ip_port",ip-transparent,reuseport,bind="$ns2_ip_port" 2>/dev/null &
+ local tproxy_pid=$!
+
+ timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
+ local server2_pid=$!
+
+ timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
+ local server3_pid=$!
+
+ busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-u"
+ busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-u"
+ busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-u"
+
+ local result
+ # request from ns1 to ns2 (forwarded traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port",sourceport=18888)
+ if [ "$result" == "$expect_ns1_ns2" ] ;then
+ echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2"
+ else
+ echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended"
+ ret=1
+ fi
+
+ # request from ns1 to ns3 (forwarded traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port")
+ if [ "$result" = "$expect_ns1_ns3" ] ;then
+ echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
+ else
+ echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
+ ret=1
+ fi
+
+ # request from nsrouter to ns2 (localy originated traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port")
+ if [ "$result" == "$expect_nsrouter_ns2" ] ;then
+ echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2"
+ else
+ echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended"
+ ret=1
+ fi
+
+ # request from nsrouter to ns3 (localy originated traffic)
+ result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port")
+ if [ "$result" = "$expect_nsrouter_ns3" ] ;then
+ echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3"
+ else
+ echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\" as intended"
+ ret=1
+ fi
+
+ # cleanup
+ kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null
+ # shellcheck disable=SC2046 # Intended splitting of ip_command
+ ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100
+ ip netns exec "$nsrouter" $ip_command route flush table 100
+}
+
+
+if test_ping; then
+ # queue bypass works (rules were skipped, no listener)
+ echo "PASS: ${ns1} can reach ${ns2}"
+else
+ echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+ exit $ret
+fi
+
+test_tproxy_udp_forward "ip"
+test_tproxy_udp_forward "ip6"
+
+exit $ret