author		David S. Miller <davem@davemloft.net>	2019-06-19 11:23:14 -0400
committer	David S. Miller <davem@davemloft.net>	2019-06-19 11:23:14 -0400
commit		2a54003e7af1eaddc05848dac14f7bcd77301478 (patch)
tree		31644496760e3272d55f8248ef49eed16c32ec1d
parent		9371a56f7101cc3f12b57db4bfbb6159205211f4 (diff)
parent		f71fec47c2df704c7081f946d7e46fe036a4208b (diff)
Merge branch 'xdp-page_pool-fixes-and-in-flight-accounting'
Jesper Dangaard Brouer says:

====================
xdp: page_pool fixes and in-flight accounting

This patchset fixes the page_pool API and its users, such that drivers can use it for DMA mapping. A number of places existed where the DMA mapping would not get released/unmapped; all of these are fixed. This occurs e.g. when an xdp_frame gets converted to an SKB, as the network stack doesn't have any callback for XDP memory models.

The patchset also addresses a shutdown race condition. Today, removing an XDP memory model based on page_pool is only delayed one RCU grace period. This isn't enough, as redirected xdp_frames can still be in-flight on different queues (remote driver TX, cpumap or veth).

We stress that when drivers use page_pool for DMA mapping, they MUST use one packet per page. This might change in the future, but more work lies ahead before we can lift this restriction.

This patchset makes the page_pool API more strict, as in-flight page accounting is added.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
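For driver authors, the rule above means: whenever a page allocated from a pool created with PP_FLAG_DMA_MAP leaves the page_pool (e.g. because the packet becomes an SKB), the DMA mapping and in-flight accounting must be released explicitly. A minimal sketch of that hand-off, using a hypothetical driver (the hypo_* names are illustrative, not part of this series):

/* Hypothetical RX path: the page is leaving the page_pool for the
 * SKB path, so release the DMA mapping and in-flight state here --
 * the network stack has no callback into XDP memory models.
 */
static void hypo_rx_to_stack(struct hypo_rq *rq, struct page *page,
			     struct sk_buff *skb)
{
	page_pool_release_page(rq->page_pool, page);
	napi_gro_receive(&rq->napi, skb);
}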
-rw-r--r--	drivers/net/ethernet/mellanox/mlx5/core/en_main.c	12
-rw-r--r--	drivers/net/ethernet/mellanox/mlx5/core/en_rx.c	3
-rw-r--r--	drivers/net/veth.c	1
-rw-r--r--	include/net/page_pool.h	69
-rw-r--r--	include/net/xdp.h	15
-rw-r--r--	include/net/xdp_priv.h	23
-rw-r--r--	include/trace/events/page_pool.h	87
-rw-r--r--	include/trace/events/xdp.h	115
-rw-r--r--	kernel/bpf/cpumap.c	3
-rw-r--r--	net/core/net-traces.c	4
-rw-r--r--	net/core/page_pool.c	95
-rw-r--r--	net/core/xdp.c	120
12 files changed, 502 insertions, 45 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a8e8350b38aa..5e40db8f92e6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -545,8 +545,10 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
}
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_POOL, rq->page_pool);
- if (err)
+ if (err) {
+ page_pool_free(rq->page_pool);
goto err_free;
+ }
for (i = 0; i < wq_sz; i++) {
if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
@@ -611,8 +613,6 @@ err_rq_wq_destroy:
if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq);
- if (rq->page_pool)
- page_pool_destroy(rq->page_pool);
mlx5_wq_destroy(&rq->wq_ctrl);
return err;
@@ -625,10 +625,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog);
- xdp_rxq_info_unreg(&rq->xdp_rxq);
- if (rq->page_pool)
- page_pool_destroy(rq->page_pool);
-
switch (rq->wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
kvfree(rq->mpwqe.info);
@@ -645,6 +641,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
mlx5e_page_release(rq, dma_info, false);
}
+
+ xdp_rxq_info_unreg(&rq->xdp_rxq);
mlx5_wq_destroy(&rq->wq_ctrl);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 935bf62eddc1..234a3fd39901 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -248,7 +248,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0,
PAGE_SIZE, rq->buff.map_dir);
if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
- put_page(dma_info->page);
+ page_pool_recycle_direct(rq->page_pool, dma_info->page);
dma_info->page = NULL;
return -ENOMEM;
}
@@ -272,6 +272,7 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
page_pool_recycle_direct(rq->page_pool, dma_info->page);
} else {
mlx5e_page_dma_unmap(rq, dma_info);
+ page_pool_release_page(rq->page_pool, dma_info->page);
put_page(dma_info->page);
}
}
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 52110e54e621..c6916bf1017b 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -547,6 +547,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
goto err;
}
+ xdp_release_frame(frame);
xdp_scrub_frame(frame);
skb->protocol = eth_type_trans(skb, rq->dev);
err:
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 694d055e01ef..f09b3f1994e6 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -16,14 +16,16 @@
* page_pool_alloc_pages() call. Drivers should likely use
* page_pool_dev_alloc_pages() replacing dev_alloc_pages().
*
- * If page_pool handles DMA mapping (use page->private), then API user
- * is responsible for invoking page_pool_put_page() once. In-case of
- * elevated refcnt, the DMA state is released, assuming other users of
- * the page will eventually call put_page().
+ * The API keeps track of in-flight pages, in order to let API users
+ * know when it is safe to deallocate the page_pool object. Thus, API
+ * users must make sure to call page_pool_release_page() when a page
+ * is "leaving" the page_pool, or call page_pool_put_page() where
+ * appropriate, in order to maintain correct accounting.
*
- * If no DMA mapping is done, then it can act as shim-layer that
- * fall-through to alloc_page. As no state is kept on the page, the
- * regular put_page() call is sufficient.
+ * API users must only call page_pool_put_page() once per page: it
+ * will either recycle the page, or, in case of an elevated refcnt,
+ * release the DMA mapping and the in-flight state accounting. We
+ * hope to lift this requirement in the future.
*/
#ifndef _NET_PAGE_POOL_H
#define _NET_PAGE_POOL_H
@@ -66,9 +68,10 @@ struct page_pool_params {
};
struct page_pool {
- struct rcu_head rcu;
struct page_pool_params p;
+ u32 pages_state_hold_cnt;
+
/*
* Data structure for allocation side
*
@@ -96,6 +99,8 @@ struct page_pool {
* TODO: Implement bulk return pages into this structure.
*/
struct ptr_ring ring;
+
+ atomic_t pages_state_release_cnt;
};
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -109,7 +114,16 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
struct page_pool *page_pool_create(const struct page_pool_params *params);
-void page_pool_destroy(struct page_pool *pool);
+void __page_pool_free(struct page_pool *pool);
+static inline void page_pool_free(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled in, net/core/xdp.c doesn't allow
+	 * registering MEM_TYPE_PAGE_POOL, but we still shield the linker
+	 * from the unresolved __page_pool_free symbol.
+	 */
+#ifdef CONFIG_PAGE_POOL
+ __page_pool_free(pool);
+#endif
+}
/* Never call this directly, use helpers below */
void __page_pool_put_page(struct page_pool *pool,
@@ -132,6 +146,43 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
__page_pool_put_page(pool, page, true);
}
+/* API user MUST have disconnected alloc-side (not allowed to call
+ * page_pool_alloc_pages()) before calling this. The free-side can
+ * still run concurrently, to handle in-flight packet-pages.
+ *
+ * A request to shut down can fail (returning false) if there are
+ * still in-flight packet-pages.
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool);
+static inline bool page_pool_request_shutdown(struct page_pool *pool)
+{
+	/* When page_pool isn't compiled in, net/core/xdp.c doesn't allow
+	 * registering MEM_TYPE_PAGE_POOL, but we still shield the linker
+	 * from the unresolved __page_pool_request_shutdown symbol.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	return __page_pool_request_shutdown(pool);
+#else
+	return true; /* Nothing can be in-flight without page_pool */
+#endif
+}
+
+/* Disconnect a page from a page_pool. API users may need to do this
+ * in order to use the page as a regular page (one that will
+ * eventually be returned to the normal page-allocator via put_page()).
+ */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page);
+static inline void page_pool_release_page(struct page_pool *pool,
+ struct page *page)
+{
+#ifdef CONFIG_PAGE_POOL
+ page_pool_unmap_page(pool, page);
+#endif
+}
+
+static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
+{
+ return page->dma_addr;
+}
+
static inline bool is_page_pool_compiled_in(void)
{
#ifdef CONFIG_PAGE_POOL
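The new page_pool_get_dma_addr() helper above simply reads back the DMA address the pool stored in page->dma_addr at allocation time. A hedged sketch of how an RX refill path might use it (the hypo_* names and the rq layout are assumptions; it presumes a pool created with PP_FLAG_DMA_MAP and DMA_FROM_DEVICE):

/* Hypothetical RX-ring refill, only illustrating the new helper. */
static int hypo_rx_refill(struct hypo_rq *rq)
{
	struct page *page = page_pool_dev_alloc_pages(rq->page_pool);
	dma_addr_t dma;

	if (unlikely(!page))
		return -ENOMEM;

	dma = page_pool_get_dma_addr(page);	/* mapped by the pool */
	dma_sync_single_for_device(rq->dma_dev, dma, PAGE_SIZE,
				   DMA_FROM_DEVICE);
	hypo_post_rx_buffer(rq, dma, page);	/* hand to HW descriptor */
	return 0;
}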
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 8e0deddef35c..40c6d3398458 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -129,6 +129,21 @@ void xdp_return_frame(struct xdp_frame *xdpf);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
void xdp_return_buff(struct xdp_buff *xdp);
+/* When an xdp_frame is sent into the network stack, there is no
+ * return-point callback through which resources like a page_pool
+ * DMA-mapping could be released. Thus, provide an explicit function
+ * to release frame resources.
+ */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
+static inline void xdp_release_frame(struct xdp_frame *xdpf)
+{
+ struct xdp_mem_info *mem = &xdpf->mem;
+
+	/* Currently only page_pool needs this */
+ if (mem->type == MEM_TYPE_PAGE_POOL)
+ __xdp_release_frame(xdpf->data, mem);
+}
+
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
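As the veth hunk above and the cpumap hunk below show, converting an xdp_frame into an SKB now pairs the new xdp_release_frame() with the existing xdp_scrub_frame(). Condensed into one hedged sketch (SKB construction omitted, hypo_* name illustrative):

/* xdp_frame -> SKB hand-off pattern, condensed from cpumap/veth. */
static struct sk_buff *hypo_xdpf_to_skb(struct xdp_frame *xdpf,
					struct sk_buff *skb,
					struct net_device *dev)
{
	xdp_release_frame(xdpf);	/* release page_pool DMA + in-flight state */
	xdp_scrub_frame(xdpf);		/* allow SKB to reuse the xdp_frame area */
	skb->protocol = eth_type_trans(skb, dev);
	return skb;
}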
diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
new file mode 100644
index 000000000000..6a8cba6ea79a
--- /dev/null
+++ b/include/net/xdp_priv.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_NET_XDP_PRIV_H__
+#define __LINUX_NET_XDP_PRIV_H__
+
+#include <linux/rhashtable.h>
+
+/* Private to net/core/xdp.c, but used by trace/events/xdp.h */
+struct xdp_mem_allocator {
+ struct xdp_mem_info mem;
+ union {
+ void *allocator;
+ struct page_pool *page_pool;
+ struct zero_copy_allocator *zc_alloc;
+ };
+ int disconnect_cnt;
+ unsigned long defer_start;
+ struct rhash_head node;
+ struct rcu_head rcu;
+ struct delayed_work defer_wq;
+ unsigned long defer_warn;
+};
+
+#endif /* __LINUX_NET_XDP_PRIV_H__ */
diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
new file mode 100644
index 000000000000..47b5ee880aa9
--- /dev/null
+++ b/include/trace/events/page_pool.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_pool
+
+#if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_POOL_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#include <net/page_pool.h>
+
+TRACE_EVENT(page_pool_inflight,
+
+ TP_PROTO(const struct page_pool *pool,
+ s32 inflight, u32 hold, u32 release),
+
+ TP_ARGS(pool, inflight, hold, release),
+
+ TP_STRUCT__entry(
+ __field(const struct page_pool *, pool)
+ __field(s32, inflight)
+ __field(u32, hold)
+ __field(u32, release)
+ ),
+
+ TP_fast_assign(
+ __entry->pool = pool;
+ __entry->inflight = inflight;
+ __entry->hold = hold;
+ __entry->release = release;
+ ),
+
+ TP_printk("page_pool=%p inflight=%d hold=%u release=%u",
+ __entry->pool, __entry->inflight, __entry->hold, __entry->release)
+);
+
+TRACE_EVENT(page_pool_state_release,
+
+ TP_PROTO(const struct page_pool *pool,
+ const struct page *page, u32 release),
+
+ TP_ARGS(pool, page, release),
+
+ TP_STRUCT__entry(
+ __field(const struct page_pool *, pool)
+ __field(const struct page *, page)
+ __field(u32, release)
+ ),
+
+ TP_fast_assign(
+ __entry->pool = pool;
+ __entry->page = page;
+ __entry->release = release;
+ ),
+
+ TP_printk("page_pool=%p page=%p release=%u",
+ __entry->pool, __entry->page, __entry->release)
+);
+
+TRACE_EVENT(page_pool_state_hold,
+
+ TP_PROTO(const struct page_pool *pool,
+ const struct page *page, u32 hold),
+
+ TP_ARGS(pool, page, hold),
+
+ TP_STRUCT__entry(
+ __field(const struct page_pool *, pool)
+ __field(const struct page *, page)
+ __field(u32, hold)
+ ),
+
+ TP_fast_assign(
+ __entry->pool = pool;
+ __entry->page = page;
+ __entry->hold = hold;
+ ),
+
+ TP_printk("page_pool=%p page=%p hold=%u",
+ __entry->pool, __entry->page, __entry->hold)
+);
+
+#endif /* _TRACE_PAGE_POOL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
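With these events enabled, the TP_printk() formats above yield trace lines of the following shape (pointer and counter values are illustrative only):

  page_pool_state_hold: page_pool=0xffff888102a0e800 page=0xffffea000418c000 hold=64
  page_pool_state_release: page_pool=0xffff888102a0e800 page=0xffffea000418c000 release=61
  page_pool_inflight: page_pool=0xffff888102a0e800 inflight=3 hold=64 release=61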
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e95cb86b65cf..bb5e380e2ef3 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -269,6 +269,121 @@ TRACE_EVENT(xdp_devmap_xmit,
__entry->from_ifindex, __entry->to_ifindex, __entry->err)
);
+/* Expect users to have already included <net/xdp.h>, but not xdp_priv.h */
+#include <net/xdp_priv.h>
+
+#define __MEM_TYPE_MAP(FN) \
+ FN(PAGE_SHARED) \
+ FN(PAGE_ORDER0) \
+ FN(PAGE_POOL) \
+ FN(ZERO_COPY)
+
+#define __MEM_TYPE_TP_FN(x) \
+ TRACE_DEFINE_ENUM(MEM_TYPE_##x);
+#define __MEM_TYPE_SYM_FN(x) \
+ { MEM_TYPE_##x, #x },
+#define __MEM_TYPE_SYM_TAB \
+ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 }
+__MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
+
+TRACE_EVENT(mem_disconnect,
+
+ TP_PROTO(const struct xdp_mem_allocator *xa,
+ bool safe_to_remove, bool force),
+
+ TP_ARGS(xa, safe_to_remove, force),
+
+ TP_STRUCT__entry(
+ __field(const struct xdp_mem_allocator *, xa)
+ __field(u32, mem_id)
+ __field(u32, mem_type)
+ __field(const void *, allocator)
+ __field(bool, safe_to_remove)
+ __field(bool, force)
+ __field(int, disconnect_cnt)
+ ),
+
+ TP_fast_assign(
+ __entry->xa = xa;
+ __entry->mem_id = xa->mem.id;
+ __entry->mem_type = xa->mem.type;
+ __entry->allocator = xa->allocator;
+ __entry->safe_to_remove = safe_to_remove;
+ __entry->force = force;
+ __entry->disconnect_cnt = xa->disconnect_cnt;
+ ),
+
+ TP_printk("mem_id=%d mem_type=%s allocator=%p"
+ " safe_to_remove=%s force=%s disconnect_cnt=%d",
+ __entry->mem_id,
+ __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+ __entry->allocator,
+ __entry->safe_to_remove ? "true" : "false",
+ __entry->force ? "true" : "false",
+ __entry->disconnect_cnt
+ )
+);
+
+TRACE_EVENT(mem_connect,
+
+ TP_PROTO(const struct xdp_mem_allocator *xa,
+ const struct xdp_rxq_info *rxq),
+
+ TP_ARGS(xa, rxq),
+
+ TP_STRUCT__entry(
+ __field(const struct xdp_mem_allocator *, xa)
+ __field(u32, mem_id)
+ __field(u32, mem_type)
+ __field(const void *, allocator)
+ __field(const struct xdp_rxq_info *, rxq)
+ __field(int, ifindex)
+ ),
+
+ TP_fast_assign(
+ __entry->xa = xa;
+ __entry->mem_id = xa->mem.id;
+ __entry->mem_type = xa->mem.type;
+ __entry->allocator = xa->allocator;
+ __entry->rxq = rxq;
+ __entry->ifindex = rxq->dev->ifindex;
+ ),
+
+ TP_printk("mem_id=%d mem_type=%s allocator=%p"
+ " ifindex=%d",
+ __entry->mem_id,
+ __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+ __entry->allocator,
+ __entry->ifindex
+ )
+);
+
+TRACE_EVENT(mem_return_failed,
+
+ TP_PROTO(const struct xdp_mem_info *mem,
+ const struct page *page),
+
+ TP_ARGS(mem, page),
+
+ TP_STRUCT__entry(
+ __field(const struct page *, page)
+ __field(u32, mem_id)
+ __field(u32, mem_type)
+ ),
+
+ TP_fast_assign(
+ __entry->page = page;
+ __entry->mem_id = mem->id;
+ __entry->mem_type = mem->type;
+ ),
+
+ TP_printk("mem_id=%d mem_type=%s page=%p",
+ __entry->mem_id,
+ __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
+ __entry->page
+ )
+);
+
#endif /* _TRACE_XDP_H */
#include <trace/define_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ee5532cf6a6..8dff08768087 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -208,6 +208,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* - RX ring dev queue index (skb_record_rx_queue)
*/
+	/* Until page_pool gets an SKB return path, release the DMA-mapping here */
+ xdp_release_frame(xdpf);
+
/* Allow SKB to reuse area used by xdp_frame */
xdp_scrub_frame(xdpf);
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 470b179d599e..283ddb2dbc7d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
#endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
#include <trace/events/neigh.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5b2252c6d49b..b366f59885c1 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,9 +4,11 @@
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/device.h>
#include <net/page_pool.h>
#include <linux/dma-direction.h>
@@ -14,6 +16,8 @@
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
+#include <trace/events/page_pool.h>
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -43,6 +47,11 @@ static int page_pool_init(struct page_pool *pool,
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
+ atomic_set(&pool->pages_state_release_cnt, 0);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ get_device(pool->p.dev);
+
return 0;
}
@@ -151,6 +160,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
page->dma_addr = dma;
skip_dma_map:
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+
+ trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
	/* When a page is just alloc'ed, it should/must have refcnt 1. */
return page;
}
@@ -173,6 +187,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
}
EXPORT_SYMBOL(page_pool_alloc_pages);
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b) (s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
+
+ distance = _distance(hold_cnt, release_cnt);
+
+ trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
+ return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+ s32 inflight = page_pool_inflight(pool);
+
+ /* The distance should not be able to become negative */
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+ return (inflight == 0);
+}
+
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
@@ -180,7 +221,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
dma_addr_t dma;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
- return;
+ goto skip_dma_unmap;
dma = page->dma_addr;
/* DMA unmap */
@@ -188,12 +229,27 @@ static void __page_pool_clean_page(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
page->dma_addr = 0;
+skip_dma_unmap:
+ atomic_inc(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, page,
+ atomic_read(&pool->pages_state_release_cnt));
}
+/* Unmap the page and clean up our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+	/* When a page is unmapped, this implies the page will not be
+	 * returned to the page_pool.
+	 */
+ __page_pool_clean_page(pool, page);
+}
+EXPORT_SYMBOL(page_pool_unmap_page);
+
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
__page_pool_clean_page(pool, page);
+
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
@@ -285,21 +341,41 @@ static void __page_pool_empty_ring(struct page_pool *pool)
}
}
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __warn_in_flight(struct page_pool *pool)
{
- struct page_pool *pool;
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
- pool = container_of(rcu, struct page_pool, rcu);
+ distance = _distance(hold_cnt, release_cnt);
+	/* Drivers should fix this, but it is only problematic when DMA is used */
+ WARN(1, "Still in-flight pages:%d hold:%u released:%u",
+ distance, hold_cnt, release_cnt);
+}
+
+void __page_pool_free(struct page_pool *pool)
+{
WARN(pool->alloc.count, "API usage violation");
+ WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+ /* Can happen due to forced shutdown */
+ if (!__page_pool_safe_to_destroy(pool))
+ __warn_in_flight(pool);
- __page_pool_empty_ring(pool);
ptr_ring_cleanup(&pool->ring, NULL);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ put_device(pool->p.dev);
+
kfree(pool);
}
+EXPORT_SYMBOL(__page_pool_free);
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
{
struct page *page;
@@ -317,7 +393,6 @@ void page_pool_destroy(struct page_pool *pool)
*/
__page_pool_empty_ring(pool);
- /* An xdp_mem_allocator can still ref page_pool pointer */
- call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+ return __page_pool_safe_to_destroy(pool);
}
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
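The hold/release counters are free-running u32 values, so _distance() relies on serial-number arithmetic: it stays correct across wraparound as long as fewer than 2^31 pages are in flight. A standalone worked example (kernel-typed snippet, not code from this series):

/* hold_cnt has wrapped past zero; release_cnt has not yet. */
u32 hold_cnt    = 5;		/* 2^32 + 5 pages handed out in total */
u32 release_cnt = 0xFFFFFFFE;	/* two releases short of the wrap point */
s32 inflight    = (s32)(hold_cnt - release_cnt);	/* == 7, not negative */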
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8aab08b131d9..b29d7b513a18 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -14,6 +14,8 @@
#include <net/page_pool.h>
#include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN;
static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;
-struct xdp_mem_allocator {
- struct xdp_mem_info mem;
- union {
- void *allocator;
- struct page_pool *page_pool;
- struct zero_copy_allocator *zc_alloc;
- };
- struct rhash_head node;
- struct rcu_head rcu;
-};
-
static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
const u32 *k = data;
@@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+	/* The allocator has indicated it is safe to remove before this is called */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ page_pool_free(xa->page_pool);
+
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
- /* Notice, driver is expected to free the *allocator,
- * e.g. page_pool, and MUST also use RCU free.
- */
-
/* Poison memory */
xa->mem.id = 0xFFFF;
xa->mem.type = 0xF0F0;
@@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
+bool __mem_id_disconnect(int id, bool force)
+{
+ struct xdp_mem_allocator *xa;
+ bool safe_to_remove = true;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+		WARN(1, "Request to remove non-existing id(%d), driver bug?", id);
+ return true;
+ }
+ xa->disconnect_cnt++;
+
+ /* Detects in-flight packet-pages for page_pool */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+ trace_mem_disconnect(xa, safe_to_remove, force);
+
+ if ((safe_to_remove || force) &&
+ !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+	return safe_to_remove || force;
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (30 * HZ)
+#define DEFER_MAX_RETRIES 120
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+ bool force = false;
+
+ if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
+ force = true;
+
+ if (__mem_id_disconnect(xa->mem.id, force))
+ return;
+
+ /* Periodic warning */
+ if (time_after_eq(jiffies, xa->defer_warn)) {
+ int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+
+ pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
+ __func__, xa->mem.id, xa->disconnect_cnt, sec);
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
+ if (__mem_id_disconnect(id, false))
+ return;
+
+	/* Could not disconnect; defer a new disconnect attempt until later */
mutex_lock(&mem_id_lock);
xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+ xa->defer_start = jiffies;
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
mutex_unlock(&mem_id_lock);
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
+/* This unregister operation will also clean up and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -301,12 +364,15 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
+ ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
+ xdp_rxq->mem.id = 0;
errno = PTR_ERR(ptr);
goto err;
}
mutex_unlock(&mem_id_lock);
+ trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
err:
mutex_unlock(&mem_id_lock);
@@ -333,10 +399,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (xa) {
+ if (likely(xa)) {
napi_direct &= !xdp_return_frame_no_direct();
page_pool_put_page(xa->page_pool, page, napi_direct);
} else {
+		/* Hopefully the stack trace shows who to blame for the late return */
+ WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
+ trace_mem_return_failed(mem, page);
put_page(page);
}
rcu_read_unlock();
@@ -379,6 +448,21 @@ void xdp_return_buff(struct xdp_buff *xdp)
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+/* Only called for MEM_TYPE_PAGE_POOL, see xdp.h */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_release_page(xa->page_pool, page);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__xdp_release_frame);
+
int xdp_attachment_query(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
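Taken together with the mlx5 changes at the top, driver teardown under the new lifetime rules follows a fixed order: stop the alloc side, drain the ring, then let xdp_rxq_info_unreg() disconnect the memory model and free the pool (deferred via the workqueue above if pages are still in flight). A hedged sketch with hypothetical hypo_* helpers:

/* Hypothetical teardown order under the new page_pool lifetime rules. */
static void hypo_close_rq(struct hypo_rq *rq)
{
	napi_disable(&rq->napi);		/* no further page_pool_alloc_pages() */
	hypo_drain_rx_ring(rq);			/* release every page the ring still holds */
	xdp_rxq_info_unreg(&rq->xdp_rxq);	/* disconnects the mem model; frees the
						 * page_pool once in-flight reaches zero,
						 * retrying from delayed work if not
						 */
	/* Note: no direct page_pool_free() call here -- after this series,
	 * ownership of the free has moved to net/core/xdp.c.
	 */
}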