summary | refs | log | tree | commit | diff
path: root/net
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2024-08-15 10:35:20 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2024-08-15 10:35:20 -0700
commit a4a35f6cbebbf9466b6c412506ab89299d567f51 (patch)
tree edc5df58c8f5ed10813cdbceb5022fe473d0fc86 /net
parent 20573d8e1c2801d6f0cc08d26003248fd118962b (diff)
parent 9c5af2d7dfe18e3a36f85fad8204cd2442ecd82b (diff)
Merge tag 'net-6.11-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Paolo Abeni:
 "Including fixes from wireless and netfilter

  Current release - regressions:

   - udp: fall back to software USO if IPv6 extension headers are present

   - wifi: iwlwifi: correctly lookup DMA address in SG table

  Current release - new code bugs:

   - eth: mlx5e: fix queue stats access to non-existing channels splat

  Previous releases - regressions:

   - eth: mlx5e: take state lock during tx timeout reporter

   - eth: mlxbf_gige: disable RX filters until RX path initialized

   - eth: igc: fix reset adapter logics when tx mode change

  Previous releases - always broken:

   - tcp: update window clamping condition

   - netfilter:
      - nf_queue: drop packets with cloned unconfirmed conntracks
      - nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests

   - vsock: fix recursive ->recvmsg calls

   - dsa: vsc73xx: fix MDIO bus access and PHY operations

   - eth: gtp: pull network headers in gtp_dev_xmit()

   - eth: igc: fix packet still tx after gate close by reducing i226 MAC retry buffer

   - eth: mana: fix RX buf alloc_size alignment and atomic op panic

   - eth: hns3: fix a deadlock problem when config TC during resetting"

* tag 'net-6.11-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (58 commits)
  net: hns3: use correct release function during uninitialization
  net: hns3: void array out of bound when loop tnl_num
  net: hns3: fix a deadlock problem when config TC during resetting
  net: hns3: use the user's cfg after reset
  net: hns3: fix wrong use of semaphore up
  selftests: net: lib: kill PIDs before del netns
  pse-core: Conditionally set current limit during PI regulator registration
  net: thunder_bgx: Fix netdev structure allocation
  net: ethtool: Allow write mechanism of LPL and both LPL and EPL
  vsock: fix recursive ->recvmsg calls
  selftest: af_unix: Fix kselftest compilation warnings
  netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests
  netfilter: nf_tables: Introduce nf_tables_getobj_single
  netfilter: nf_tables: Audit log dump reset after the fact
  selftests: netfilter: add test for br_netfilter+conntrack+queue combination
  netfilter: nf_queue: drop packets with cloned unconfirmed conntracks
  netfilter: flowtable: initialise extack before use
  netfilter: nfnetlink: Initialise extack before use in ACKs
  netfilter: allow ipv6 fragments to arrive on different devices
  tcp: Update window clamping condition
  ...
Diffstat (limited to 'net')
-rw-r--r-- net/bridge/br_netfilter_hooks.c 6
-rw-r--r-- net/core/dev.c 26
-rw-r--r-- net/ethtool/cmis_fw_update.c 8
-rw-r--r-- net/ipv4/tcp_input.c 28
-rw-r--r-- net/ipv4/udp_offload.c 6
-rw-r--r-- net/ipv6/netfilter/nf_conntrack_reasm.c 4
-rw-r--r-- net/mptcp/diag.c 2
-rw-r--r-- net/netfilter/nf_flow_table_offload.c 2
-rw-r--r-- net/netfilter/nf_tables_api.c 147
-rw-r--r-- net/netfilter/nfnetlink.c 5
-rw-r--r-- net/netfilter/nfnetlink_queue.c 35
-rw-r--r-- net/vmw_vsock/af_vsock.c 50
-rw-r--r-- net/vmw_vsock/vsock_bpf.c 4
13 files changed, 222 insertions, 101 deletions
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 09f6a773a708..8f9c19d992ac 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -622,8 +622,12 @@ static unsigned int br_nf_local_in(void *priv,
if (likely(nf_ct_is_confirmed(ct)))
return NF_ACCEPT;
+ if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) {
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
WARN_ON_ONCE(skb_shared(skb));
- WARN_ON_ONCE(refcount_read(&nfct->use) != 1);
/* We can't call nf_confirm here, it would create a dependency
* on nf_conntrack module.
diff --git a/net/core/dev.c b/net/core/dev.c
index 751d9b70e6ad..f66e61407883 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9912,6 +9912,15 @@ static void netdev_sync_lower_features(struct net_device *upper,
}
}
+static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
+{
+ netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ return ip_csum || hw_csum;
+}
+
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -9993,15 +10002,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_LRO;
}
- if (features & NETIF_F_HW_TLS_TX) {
- bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
- (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- bool hw_csum = features & NETIF_F_HW_CSUM;
-
- if (!ip_csum && !hw_csum) {
- netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
- features &= ~NETIF_F_HW_TLS_TX;
- }
+ if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
}
if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
@@ -10009,6 +10012,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
features &= ~NETIF_F_HW_TLS_RX;
}
+ if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
+ features &= ~NETIF_F_GSO_UDP_L4;
+ }
+
return features;
}
diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c
index ae4b4b28a601..655ff5224ffa 100644
--- a/net/ethtool/cmis_fw_update.c
+++ b/net/ethtool/cmis_fw_update.c
@@ -35,7 +35,10 @@ struct cmis_cdb_fw_mng_features_rpl {
__be16 resv7;
};
-#define CMIS_CDB_FW_WRITE_MECHANISM_LPL 0x01
+enum cmis_cdb_fw_write_mechanism {
+ CMIS_CDB_FW_WRITE_MECHANISM_LPL = 0x01,
+ CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11,
+};
static int
cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
@@ -64,7 +67,8 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
}
rpl = (struct cmis_cdb_fw_mng_features_rpl *)args.req.payload;
- if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL)) {
+ if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ||
+ rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_BOTH)) {
ethnl_module_fw_flash_ntf_err(dev, ntf_params,
"Write LPL is not supported",
NULL);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e2b9583ed96a..e37488d3453f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -238,9 +238,14 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+ u8 old_ratio = tcp_sk(sk)->scaling_ratio;
do_div(val, skb->truesize);
tcp_sk(sk)->scaling_ratio = val ? val : 1;
+
+ if (old_ratio != tcp_sk(sk)->scaling_ratio)
+ WRITE_ONCE(tcp_sk(sk)->window_clamp,
+ tcp_win_from_space(sk, sk->sk_rcvbuf));
}
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
@@ -754,7 +759,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
* <prev RTT . ><current RTT .. ><next RTT .... >
*/
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
u64 rcvwin, grow;
int rcvbuf;
@@ -770,22 +776,12 @@ void tcp_rcv_space_adjust(struct sock *sk)
rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- if (rcvbuf > sk->sk_rcvbuf) {
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
- /* Make the window clamp follow along. */
- WRITE_ONCE(tp->window_clamp,
- tcp_win_from_space(sk, rcvbuf));
- }
- } else {
- /* Make the window clamp follow along while being bounded
- * by SO_RCVBUF.
- */
- int clamp = tcp_win_from_space(sk, min(rcvbuf, sk->sk_rcvbuf));
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
- if (clamp > tp->window_clamp)
- WRITE_ONCE(tp->window_clamp, clamp);
+ /* Make the window clamp follow along. */
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
}
}
tp->rcvq_space.space = copied;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index bc8a9da750fe..b254a5dadfcf 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -282,6 +282,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
skb_transport_header(gso_skb)))
return ERR_PTR(-EINVAL);
+ /* We don't know if egress device can segment and checksum the packet
+ * when IPv6 extension headers are present. Fall back to software GSO.
+ */
+ if (gso_skb->ip_summed != CHECKSUM_PARTIAL)
+ features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK);
+
if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6f0844c9315d..4120e67a8ce6 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -154,6 +154,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
};
struct inet_frag_queue *q;
+ if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
+ IPV6_ADDR_LINKLOCAL)))
+ key.iif = 0;
+
q = inet_frag_find(nf_frag->fqdir, &key);
if (!q)
return NULL;
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
index 3ae46b545d2c..2d3efb405437 100644
--- a/net/mptcp/diag.c
+++ b/net/mptcp/diag.c
@@ -94,7 +94,7 @@ static size_t subflow_get_info_size(const struct sock *sk)
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
- nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */
nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index ff1a4e36c2b5..e06bc36f49fe 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -841,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
struct list_head *block_cb_list)
{
struct flow_cls_offload cls_flow = {};
+ struct netlink_ext_ack extack = {};
struct flow_block_cb *block_cb;
- struct netlink_ext_ack extack;
__be16 proto = ETH_P_ALL;
int err, i = 0;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 481ee78e77bc..0a2f79346958 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8020,6 +8020,19 @@ cont:
return skb->len;
}
+static int nf_tables_dumpreset_obj(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_obj(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
@@ -8036,12 +8049,18 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb)
if (nla[NFTA_OBJ_TYPE])
ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- ctx->reset = true;
-
return 0;
}
+static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
+{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_obj_start(cb);
+}
+
static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
@@ -8052,8 +8071,9 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
- const struct nlattr * const nla[])
+static struct sk_buff *
+nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_cur(info->net);
@@ -8062,72 +8082,109 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
struct net *net = info->net;
struct nft_object *obj;
struct sk_buff *skb2;
- bool reset = false;
u32 objtype;
int err;
- if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start = nf_tables_dump_obj_start,
- .dump = nf_tables_dump_obj,
- .done = nf_tables_dump_obj_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
- }
-
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
- return PTR_ERR(obj);
+ return ERR_CAST(obj);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ err = nf_tables_fill_obj_info(skb2, net, portid,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ family, table, obj, reset);
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- if (reset) {
- const struct nftables_pernet *nft_net;
- char *buf;
+ return skb2;
+}
- nft_net = nft_pernet(net);
- buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq);
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct sk_buff *skb2;
- audit_log_nfcfg(buf,
- family,
- 1,
- AUDIT_NFT_OP_OBJ_RESET,
- GFP_ATOMIC);
- kfree(buf);
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_obj_start,
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
- info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
- family, table, obj, reset);
- if (err < 0)
- goto err_fill_obj_info;
+ skb2 = nf_tables_getobj_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
- return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, info->net, portid);
+}
-err_fill_obj_info:
- kfree_skb(skb2);
- return err;
+static int nf_tables_getobj_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dumpreset_obj_start,
+ .dump = nf_tables_dumpreset_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getobj_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_OBJ_TABLE]),
+ (char *)nla_data(nla[NFTA_OBJ_TABLE]),
+ nft_net->base_seq);
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
}
static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
@@ -9410,7 +9467,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
- .call = nf_tables_getobj,
+ .call = nf_tables_getobj_reset,
.type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 4abf660c7baf..932b3ddb34f1 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -427,8 +427,10 @@ replay_abort:
nfnl_unlock(subsys_id);
- if (nlh->nlmsg_flags & NLM_F_ACK)
+ if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
nfnl_err_add(&err_list, nlh, 0, &extack);
+ }
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -577,6 +579,7 @@ done:
ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
} else if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
nfnl_err_add(&err_list, nlh, 0, &extack);
}
} else {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 55e28e1da66e..e0716da256bf 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -820,10 +820,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
- const struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ unsigned long status;
+ unsigned int use;
- if (ct && ((ct->status & flags) == IPS_DYING))
+ if (!ct)
+ return false;
+
+ status = READ_ONCE(ct->status);
+ if ((status & flags) == IPS_DYING)
return true;
+
+ if (status & IPS_CONFIRMED)
+ return false;
+
+ /* in some cases skb_clone() can occur after initial conntrack
+ * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+ * unconfirmed entries.
+ *
+ * This happens for br_netfilter and with ip multicast routing.
+ * This can't be solved with serialization here because one clone could
+ * have been queued for local delivery.
+ */
+ use = refcount_read(&ct->ct_general.use);
+ if (likely(use == 1))
+ return false;
+
+ /* Can't decrement further? Exclusive ownership. */
+ if (!refcount_dec_not_one(&ct->ct_general.use))
+ return false;
+
+ skb_set_nfct(entry->skb, 0);
+ /* No nf_ct_put(): we already decremented .use and it cannot
+ * drop down to 0.
+ */
+ return true;
#endif
return false;
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 4b040285aa78..0ff9b2dd86ba 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1270,25 +1270,28 @@ out:
return err;
}
+int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
+}
+
int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
#ifdef CONFIG_BPF_SYSCALL
+ struct sock *sk = sock->sk;
const struct proto *prot;
-#endif
- struct vsock_sock *vsk;
- struct sock *sk;
- sk = sock->sk;
- vsk = vsock_sk(sk);
-
-#ifdef CONFIG_BPF_SYSCALL
prot = READ_ONCE(sk->sk_prot);
if (prot != &vsock_proto)
return prot->recvmsg(sk, msg, len, flags, NULL);
#endif
- return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
+ return __vsock_dgram_recvmsg(sock, msg, len, flags);
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
@@ -2174,15 +2177,12 @@ out:
}
int
-vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
- int flags)
+__vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
{
struct sock *sk;
struct vsock_sock *vsk;
const struct vsock_transport *transport;
-#ifdef CONFIG_BPF_SYSCALL
- const struct proto *prot;
-#endif
int err;
sk = sock->sk;
@@ -2233,14 +2233,6 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
goto out;
}
-#ifdef CONFIG_BPF_SYSCALL
- prot = READ_ONCE(sk->sk_prot);
- if (prot != &vsock_proto) {
- release_sock(sk);
- return prot->recvmsg(sk, msg, len, flags, NULL);
- }
-#endif
-
if (sk->sk_type == SOCK_STREAM)
err = __vsock_stream_recvmsg(sk, msg, len, flags);
else
@@ -2250,6 +2242,22 @@ out:
release_sock(sk);
return err;
}
+
+int
+vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct sock *sk = sock->sk;
+ const struct proto *prot;
+
+ prot = READ_ONCE(sk->sk_prot);
+ if (prot != &vsock_proto)
+ return prot->recvmsg(sk, msg, len, flags, NULL);
+#endif
+
+ return __vsock_connectible_recvmsg(sock, msg, len, flags);
+}
EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
static int vsock_set_rcvlowat(struct sock *sk, int val)
diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
index a3c97546ab84..c42c5cc18f32 100644
--- a/net/vmw_vsock/vsock_bpf.c
+++ b/net/vmw_vsock/vsock_bpf.c
@@ -64,9 +64,9 @@ static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int
int err;
if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
- err = vsock_connectible_recvmsg(sock, msg, len, flags);
+ err = __vsock_connectible_recvmsg(sock, msg, len, flags);
else if (sk->sk_type == SOCK_DGRAM)
- err = vsock_dgram_recvmsg(sock, msg, len, flags);
+ err = __vsock_dgram_recvmsg(sock, msg, len, flags);
else
err = -EPROTOTYPE;