109 files changed, 2776 insertions, 2887 deletions
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 38c15c6fcb14..0b3db91dc100 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -437,6 +437,21 @@ needed:: See the kernels selftest `Documentation/dev-tools/kselftest.rst`_ document for further documentation. +To maximize the number of tests passing, the .config of the kernel +under test should match the config file fragment in +tools/testing/selftests/bpf as closely as possible. + +Finally to ensure support for latest BPF Type Format features - +discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16 +is required for kernels built with CONFIG_DEBUG_INFO_BTF=y. +pahole is delivered in the dwarves package or can be built +from source at + +https://github.com/acmel/dwarves + +Some distros have pahole version 1.16 packaged already, e.g. +Fedora, Gentoo. + Q: Which BPF kernel selftests version should I run my kernel against? --------------------------------------------------------------------- A: If you run a kernel ``xyz``, then always run the BPF kernel selftests diff --git a/MAINTAINERS b/MAINTAINERS index b7844f6cfa4a..087e68b21f9f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18443,8 +18443,12 @@ R: Jonathan Lemon <jonathan.lemon@gmail.com> L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Maintained -F: kernel/bpf/xskmap.c +F: include/net/xdp_sock* +F: include/net/xsk_buffer_pool.h +F: include/uapi/linux/if_xdp.h F: net/xdp/ +F: samples/bpf/xdpsock* +F: tools/lib/bpf/xsk* XEN BLOCK SUBSYSTEM M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2a037ec244b9..ea7395b391e5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11,7 +11,7 @@ #include "i40e_diag.h" #include "i40e_xsk.h" #include <net/udp_tunnel.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* All i40e tracepoints are defined by the include below, which * must be included exactly once across the whole kernel with * CREATE_TRACE_POINTS defined @@ -3260,26 +3260,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) if (ring->vsi->type == I40E_VSI_MAIN) xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); + kfree(ring->rx_bi); ring->xsk_umem = i40e_xsk_umem(ring); if (ring->xsk_umem) { - ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + ret = i40e_alloc_rx_bi_zc(ring); + if (ret) + return ret; + ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem); /* For AF_XDP ZC, we disallow packets to span on * multiple buffers, thus letting us skip that * handling in the fast-path. */ chain_len = 1; - ring->zca.free = i40e_zca_free; ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &ring->zca); + MEM_TYPE_XSK_BUFF_POOL, + NULL); if (ret) return ret; dev_info(&vsi->back->pdev->dev, - "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->queue_index); } else { + ret = i40e_alloc_rx_bi(ring); + if (ret) + return ret; ring->rx_buf_len = vsi->rx_buf_len; if (ring->vsi->type == I40E_VSI_MAIN) { ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -3344,9 +3349,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q); writel(0, ring->tail); - ok = ring->xsk_umem ? 
- i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) : - !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + if (ring->xsk_umem) { + xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); + ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)); + } else { + ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); + } if (!ok) { /* Log this in case the user has forgotten to give the kernel * any buffers, even later in the application. diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index a3772beffe02..f613782f2f56 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -521,28 +521,29 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi, /** * i40e_fd_handle_status - check the Programming Status for FD * @rx_ring: the Rx ring for this descriptor - * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor. + * @qword0_raw: qword0 + * @qword1: qword1 after le_to_cpu * @prog_id: the id originally used for programming * * This is used to verify if the FD programming or invalidation * requested by SW to the HW is successful or not and take actions accordingly. **/ -void i40e_fd_handle_status(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, u8 prog_id) +static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw, + u64 qword1, u8 prog_id) { struct i40e_pf *pf = rx_ring->vsi->back; struct pci_dev *pdev = pf->pdev; + struct i40e_32b_rx_wb_qw0 *qw0; u32 fcnt_prog, fcnt_avail; u32 error; - u64 qw; - qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len); - error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >> + qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw; + error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >> I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT; if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) { - pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id); - if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) || + pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id); + if (qw0->hi_dword.fd_id != 0 || (I40E_DEBUG_FD & pf->hw.debug_mask)) dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n", pf->fd_inv); @@ -560,7 +561,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring, /* store the current atr filter count */ pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf); - if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) && + if (qw0->hi_dword.fd_id == 0 && test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) { /* These set_bit() calls aren't atomic with the * test_bit() here, but worse case we potentially @@ -589,7 +590,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring, } else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) { if (I40E_DEBUG_FD & pf->hw.debug_mask) dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n", - rx_desc->wb.qword0.hi_dword.fd_id); + qw0->hi_dword.fd_id); } } @@ -1195,6 +1196,11 @@ clear_counts: rc->total_packets = 0; } +static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx) +{ + return &rx_ring->rx_bi[idx]; +} + /** * i40e_reuse_rx_page - page flip buffer and store it back on the ring * @rx_ring: rx descriptor ring to store buffers on @@ -1208,7 +1214,7 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring, struct i40e_rx_buffer *new_buff; u16 nta = rx_ring->next_to_alloc; - new_buff = &rx_ring->rx_bi[nta]; + new_buff = i40e_rx_bi(rx_ring, nta); /* update, and store next to alloc */ nta++; @@ -1227,29 +1233,10 @@ static void 
i40e_reuse_rx_page(struct i40e_ring *rx_ring, } /** - * i40e_rx_is_programming_status - check for programming status descriptor - * @qw: qword representing status_error_len in CPU ordering - * - * The value of in the descriptor length field indicate if this - * is a programming status descriptor for flow director or FCoE - * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise - * it is a packet descriptor. - **/ -static inline bool i40e_rx_is_programming_status(u64 qw) -{ - /* The Rx filter programming status and SPH bit occupy the same - * spot in the descriptor. Since we don't support packet split we - * can just reuse the bit as an indication that this is a - * programming status descriptor. - */ - return qw & I40E_RXD_QW1_LENGTH_SPH_MASK; -} - -/** - * i40e_clean_programming_status - try clean the programming status descriptor + * i40e_clean_programming_status - clean the programming status descriptor * @rx_ring: the rx ring that has this descriptor - * @rx_desc: the rx descriptor written back by HW - * @qw: qword representing status_error_len in CPU ordering + * @qword0_raw: qword0 + * @qword1: qword1 representing status_error_len in CPU ordering * * Flow director should handle FD_FILTER_STATUS to check its filter programming * status being successful or not and take actions accordingly. FCoE should @@ -1257,34 +1244,16 @@ static inline bool i40e_rx_is_programming_status(u64 qw) * * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL. **/ -struct i40e_rx_buffer *i40e_clean_programming_status( - struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, - u64 qw) +void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw, + u64 qword1) { - struct i40e_rx_buffer *rx_buffer; - u32 ntc; u8 id; - if (!i40e_rx_is_programming_status(qw)) - return NULL; - - ntc = rx_ring->next_to_clean; - - /* fetch, update, and store next to clean */ - rx_buffer = &rx_ring->rx_bi[ntc++]; - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - - prefetch(I40E_RX_DESC(rx_ring, ntc)); - - id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> + id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >> I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT; if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS) - i40e_fd_handle_status(rx_ring, rx_desc, id); - - return rx_buffer; + i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id); } /** @@ -1336,13 +1305,25 @@ err: return -ENOMEM; } +int i40e_alloc_rx_bi(struct i40e_ring *rx_ring) +{ + unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count; + + rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL); + return rx_ring->rx_bi ? 
0 : -ENOMEM; +} + +static void i40e_clear_rx_bi(struct i40e_ring *rx_ring) +{ + memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count); +} + /** * i40e_clean_rx_ring - Free Rx buffers * @rx_ring: ring to be cleaned **/ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) { - unsigned long bi_size; u16 i; /* ring already cleared, nothing to do */ @@ -1361,7 +1342,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { - struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i); if (!rx_bi->page) continue; @@ -1388,8 +1369,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) } skip_free: - bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_bi, 0, bi_size); + if (rx_ring->xsk_umem) + i40e_clear_rx_bi_zc(rx_ring); + else + i40e_clear_rx_bi(rx_ring); /* Zero out the descriptor ring */ memset(rx_ring->desc, 0, rx_ring->size); @@ -1430,15 +1413,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring) int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) { struct device *dev = rx_ring->dev; - int err = -ENOMEM; - int bi_size; - - /* warn if we are about to overwrite the pointer */ - WARN_ON(rx_ring->rx_bi); - bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; - rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL); - if (!rx_ring->rx_bi) - goto err; + int err; u64_stats_init(&rx_ring->syncp); @@ -1451,7 +1426,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) if (!rx_ring->desc) { dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n", rx_ring->size); - goto err; + return -ENOMEM; } rx_ring->next_to_alloc = 0; @@ -1463,16 +1438,12 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->queue_index); if (err < 0) - goto err; + return err; } rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; return 0; -err: - kfree(rx_ring->rx_bi); - rx_ring->rx_bi = NULL; - return err; } /** @@ -1592,7 +1563,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) return false; rx_desc = I40E_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_bi[ntu]; + bi = i40e_rx_bi(rx_ring, ntu); do { if (!i40e_alloc_mapped_page(rx_ring, bi)) @@ -1614,7 +1585,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) ntu++; if (unlikely(ntu == rx_ring->count)) { rx_desc = I40E_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_bi; + bi = i40e_rx_bi(rx_ring, 0); ntu = 0; } @@ -1981,7 +1952,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring, { struct i40e_rx_buffer *rx_buffer; - rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; + rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); prefetchw(rx_buffer->page); /* we are reusing so sync this buffer for CPU use */ @@ -2382,9 +2353,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) */ dma_rmb(); - rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc, - qword); - if (unlikely(rx_buffer)) { + if (i40e_rx_is_programming_status(qword)) { + i40e_clean_programming_status(rx_ring, + rx_desc->raw.qword[0], + qword); + rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + i40e_inc_ntc(rx_ring); i40e_reuse_rx_page(rx_ring, rx_buffer); cleaned_count++; continue; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 36d37f31a287..5c255977fd58 100644 --- 
a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -296,17 +296,9 @@ struct i40e_tx_buffer { struct i40e_rx_buffer { dma_addr_t dma; - union { - struct { - struct page *page; - __u32 page_offset; - __u16 pagecnt_bias; - }; - struct { - void *addr; - u64 handle; - }; - }; + struct page *page; + __u32 page_offset; + __u16 pagecnt_bias; }; struct i40e_queue_stats { @@ -358,6 +350,7 @@ struct i40e_ring { union { struct i40e_tx_buffer *tx_bi; struct i40e_rx_buffer *rx_bi; + struct xdp_buff **rx_bi_zc; }; DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS); u16 queue_index; /* Queue number of ring */ @@ -419,7 +412,6 @@ struct i40e_ring { struct i40e_channel *ch; struct xdp_rxq_info xdp_rxq; struct xdp_umem *xsk_umem; - struct zero_copy_allocator zca; /* ZC allocator anchor */ } ____cacheline_internodealigned_in_smp; static inline bool ring_uses_build_skb(struct i40e_ring *ring) @@ -495,6 +487,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); +int i40e_alloc_rx_bi(struct i40e_ring *rx_ring); /** * i40e_get_head - Retrieve head from head writeback diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h index 8af0e99c6c0d..667c4dc4b39f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -4,13 +4,9 @@ #ifndef I40E_TXRX_COMMON_ #define I40E_TXRX_COMMON_ -void i40e_fd_handle_status(struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, u8 prog_id); int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring); -struct i40e_rx_buffer *i40e_clean_programming_status( - struct i40e_ring *rx_ring, - union i40e_rx_desc *rx_desc, - u64 qw); +void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw, + u64 qword1); void i40e_process_skb_fields(struct i40e_ring *rx_ring, union i40e_rx_desc *rx_desc, struct sk_buff *skb); void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring); @@ -84,6 +80,38 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring, } } +/** + * i40e_rx_is_programming_status - check for programming status descriptor + * @qword1: qword1 representing status_error_len in CPU ordering + * + * The value of in the descriptor length field indicate if this + * is a programming status descriptor for flow director or FCoE + * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise + * it is a packet descriptor. + **/ +static inline bool i40e_rx_is_programming_status(u64 qword1) +{ + /* The Rx filter programming status and SPH bit occupy the same + * spot in the descriptor. Since we don't support packet split we + * can just reuse the bit as an indication that this is a + * programming status descriptor. + */ + return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK; +} + +/** + * i40e_inc_ntc: Advance the next_to_clean index + * @rx_ring: Rx ring + **/ +static inline void i40e_inc_ntc(struct i40e_ring *rx_ring) +{ + u32 ntc = rx_ring->next_to_clean + 1; + + ntc = (ntc < rx_ring->count) ? 
ntc : 0; + rx_ring->next_to_clean = ntc; + prefetch(I40E_RX_DESC(rx_ring, ntc)); +} + void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring); void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring); bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi); diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 6ea2867ff60f..63e098f7cb63 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -689,7 +689,7 @@ union i40e_32byte_rx_desc { __le64 rsvd2; } read; struct { - struct { + struct i40e_32b_rx_wb_qw0 { struct { union { __le16 mirroring_status; @@ -727,6 +727,9 @@ union i40e_32byte_rx_desc { } hi_dword; } qword3; } wb; /* writeback */ + struct { + u64 qword[4]; + } raw; }; enum i40e_rx_desc_status_bits { diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 2b9184aead5f..f3953744c505 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,68 +2,30 @@ /* Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "i40e.h" #include "i40e_txrx_common.h" #include "i40e_xsk.h" -/** - * i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev - * @vsi: Current VSI - * @umem: UMEM to DMA map - * - * Returns 0 on success, <0 on failure - **/ -static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem) +int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring) { - struct i40e_pf *pf = vsi->back; - struct device *dev; - unsigned int i, j; - dma_addr_t dma; - - dev = &pf->pdev->dev; - for (i = 0; i < umem->npgs; i++) { - dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE, - DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); - if (dma_mapping_error(dev, dma)) - goto out_unmap; + unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count; - umem->pages[i].dma = dma; - } - - return 0; - -out_unmap: - for (j = 0; j < i; j++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); - umem->pages[i].dma = 0; - } - - return -1; + rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL); + return rx_ring->rx_bi_zc ? 
0 : -ENOMEM; } -/** - * i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev - * @vsi: Current VSI - * @umem: UMEM to DMA map - **/ -static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem) +void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring) { - struct i40e_pf *pf = vsi->back; - struct device *dev; - unsigned int i; - - dev = &pf->pdev->dev; - - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR); + memset(rx_ring->rx_bi_zc, 0, + sizeof(*rx_ring->rx_bi_zc) * rx_ring->count); +} - umem->pages[i].dma = 0; - } +static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx) +{ + return &rx_ring->rx_bi_zc[idx]; } /** @@ -78,7 +40,6 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, u16 qid) { struct net_device *netdev = vsi->netdev; - struct xdp_umem_fq_reuse *reuseq; bool if_running; int err; @@ -92,13 +53,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, qid >= netdev->real_num_tx_queues) return -EINVAL; - reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); - if (!reuseq) - return -ENOMEM; - - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - err = i40e_xsk_umem_dma_map(vsi, umem); + err = xsk_buff_dma_map(umem, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR); if (err) return err; @@ -151,7 +106,7 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid) } clear_bit(qid, vsi->af_xdp_zc_qps); - i40e_xsk_umem_dma_unmap(vsi, umem); + xsk_buff_dma_unmap(umem, I40E_RX_DMA_ATTR); if (if_running) { err = i40e_queue_pair_enable(vsi, qid); @@ -190,11 +145,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, **/ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) { - struct xdp_umem *umem = rx_ring->xsk_umem; int err, result = I40E_XDP_PASS; struct i40e_ring *xdp_ring; struct bpf_prog *xdp_prog; - u64 offset; u32 act; rcu_read_lock(); @@ -203,9 +156,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) */ xdp_prog = READ_ONCE(rx_ring->xdp_prog); act = bpf_prog_run_xdp(xdp_prog, xdp); - offset = xdp->data - xdp->data_hard_start; - - xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset); switch (act) { case XDP_PASS: @@ -232,107 +182,26 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) return result; } -/** - * i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer - * @rx_ring: Rx ring - * @bi: Rx buffer to populate - * - * This function allocates an Rx buffer. The buffer can come from fill - * queue, or via the recycle queue (next_to_alloc). 
- * - * Returns true for a successful allocation, false otherwise - **/ -static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - void *addr = bi->addr; - u64 handle, hr; - - if (addr) { - rx_ring->rx_stats.page_reuse_count++; - return true; - } - - if (!xsk_umem_peek_addr(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr(umem); - return true; -} - -/** - * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer - * @rx_ring: Rx ring - * @bi: Rx buffer to populate - * - * This function allocates an Rx buffer. The buffer can come from fill - * queue, or via the reuse queue. - * - * Returns true for a successful allocation, false otherwise - **/ -static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - u64 handle, hr; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - handle &= rx_ring->xsk_umem->chunk_mask; - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr_rq(umem); - return true; -} - -static __always_inline bool -__i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count, - bool alloc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi)) +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) { u16 ntu = rx_ring->next_to_use; union i40e_rx_desc *rx_desc; - struct i40e_rx_buffer *bi; + struct xdp_buff **bi, *xdp; + dma_addr_t dma; bool ok = true; rx_desc = I40E_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_bi[ntu]; + bi = i40e_rx_bi(rx_ring, ntu); do { - if (!alloc(rx_ring, bi)) { + xdp = xsk_buff_alloc(rx_ring->xsk_umem); + if (!xdp) { ok = false; goto no_buffers; } - - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0, - rx_ring->rx_buf_len, - DMA_BIDIRECTIONAL); - - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); + *bi = xdp; + dma = xsk_buff_xdp_get_dma(xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->read.hdr_addr = 0; rx_desc++; bi++; @@ -340,11 +209,10 @@ __i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count, if (unlikely(ntu == rx_ring->count)) { rx_desc = I40E_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_bi; + bi = i40e_rx_bi(rx_ring, 0); ntu = 0; } - rx_desc->wb.qword1.status_error_len = 0; count--; } while (count); @@ -356,127 +224,8 @@ no_buffers: } /** - * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers - * @rx_ring: Rx ring - * @count: The number of buffers to allocate - * - * This function allocates a number of Rx buffers from the reuse queue - * or fill ring and places them on the Rx ring. 
- * - * Returns true for a successful allocation, false otherwise - **/ -bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) -{ - return __i40e_alloc_rx_buffers_zc(rx_ring, count, - i40e_alloc_buffer_slow_zc); -} - -/** - * i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers - * @rx_ring: Rx ring - * @count: The number of buffers to allocate - * - * This function allocates a number of Rx buffers from the fill ring - * or the internal recycle mechanism and places them on the Rx ring. - * - * Returns true for a successful allocation, false otherwise - **/ -static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count) -{ - return __i40e_alloc_rx_buffers_zc(rx_ring, count, - i40e_alloc_buffer_zc); -} - -/** - * i40e_get_rx_buffer_zc - Return the current Rx buffer - * @rx_ring: Rx ring - * @size: The size of the rx buffer (read from descriptor) - * - * This function returns the current, received Rx buffer, and also - * does DMA synchronization. the Rx ring. - * - * Returns the received Rx buffer - **/ -static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring, - const unsigned int size) -{ - struct i40e_rx_buffer *bi; - - bi = &rx_ring->rx_bi[rx_ring->next_to_clean]; - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - bi->dma, 0, - size, - DMA_BIDIRECTIONAL); - - return bi; -} - -/** - * i40e_reuse_rx_buffer_zc - Recycle an Rx buffer - * @rx_ring: Rx ring - * @old_bi: The Rx buffer to recycle - * - * This function recycles a finished Rx buffer, and places it on the - * recycle queue (next_to_alloc). - **/ -static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *old_bi) -{ - struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc]; - u16 nta = rx_ring->next_to_alloc; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* transfer page from old buffer to new buffer */ - new_bi->dma = old_bi->dma; - new_bi->addr = old_bi->addr; - new_bi->handle = old_bi->handle; - - old_bi->addr = NULL; -} - -/** - * i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations - * @alloc: Zero-copy allocator - * @handle: Buffer handle - **/ -void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) -{ - struct i40e_rx_buffer *bi; - struct i40e_ring *rx_ring; - u64 hr, mask; - u16 nta; - - rx_ring = container_of(alloc, struct i40e_ring, zca); - hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - mask = rx_ring->xsk_umem->chunk_mask; - - nta = rx_ring->next_to_alloc; - bi = &rx_ring->rx_bi[nta]; - - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - handle &= mask; - - bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle, - rx_ring->xsk_umem->headroom); -} - -/** * i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer * @rx_ring: Rx ring - * @bi: Rx buffer * @xdp: xdp_buff * * This functions allocates a new skb from a zero-copy Rx buffer. @@ -484,7 +233,6 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) * Returns the skb, or NULL on failure. 
**/ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *bi, struct xdp_buff *xdp) { unsigned int metasize = xdp->data - xdp->data_meta; @@ -503,24 +251,11 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, if (metasize) skb_metadata_set(skb, metasize); - i40e_reuse_rx_buffer_zc(rx_ring, bi); + xsk_buff_free(xdp); return skb; } /** - * i40e_inc_ntc: Advance the next_to_clean index - * @rx_ring: Rx ring - **/ -static void i40e_inc_ntc(struct i40e_ring *rx_ring) -{ - u32 ntc = rx_ring->next_to_clean + 1; - - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - prefetch(I40E_RX_DESC(rx_ring, ntc)); -} - -/** * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring * @rx_ring: Rx ring * @budget: NAPI budget @@ -531,25 +266,20 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); - struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; - struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { - struct i40e_rx_buffer *bi; union i40e_rx_desc *rx_desc; + struct xdp_buff **bi; unsigned int size; u64 qword; if (cleaned_count >= I40E_RX_BUFFER_WRITE) { failure = failure || - !i40e_alloc_rx_buffers_fast_zc(rx_ring, - cleaned_count); + !i40e_alloc_rx_buffers_zc(rx_ring, + cleaned_count); cleaned_count = 0; } @@ -562,35 +292,36 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) */ dma_rmb(); - bi = i40e_clean_programming_status(rx_ring, rx_desc, - qword); - if (unlikely(bi)) { - i40e_reuse_rx_buffer_zc(rx_ring, bi); + if (i40e_rx_is_programming_status(qword)) { + i40e_clean_programming_status(rx_ring, + rx_desc->raw.qword[0], + qword); + bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + xsk_buff_free(*bi); + *bi = NULL; cleaned_count++; + i40e_inc_ntc(rx_ring); continue; } + bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT; if (!size) break; - bi = i40e_get_rx_buffer_zc(rx_ring, size); - xdp.data = bi->addr; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + size; - xdp.handle = bi->handle; + bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + (*bi)->data_end = (*bi)->data + size; + xsk_buff_dma_sync_for_cpu(*bi); - xdp_res = i40e_run_xdp_zc(rx_ring, &xdp); + xdp_res = i40e_run_xdp_zc(rx_ring, *bi); if (xdp_res) { - if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) { + if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) xdp_xmit |= xdp_res; - bi->addr = NULL; - } else { - i40e_reuse_rx_buffer_zc(rx_ring, bi); - } + else + xsk_buff_free(*bi); + *bi = NULL; total_rx_bytes += size; total_rx_packets++; @@ -606,7 +337,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) * BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that * SBP is *not* set in PRT_SBPVSI (default not set). 
*/ - skb = i40e_construct_skb_zc(rx_ring, bi, &xdp); + skb = i40e_construct_skb_zc(rx_ring, *bi); + *bi = NULL; if (!skb) { rx_ring->rx_stats.alloc_buff_failed++; break; @@ -664,10 +396,9 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) break; - dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr); - - dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma, + desc.len); tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use]; tx_bi->bytecount = desc.len; @@ -826,13 +557,13 @@ void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring) u16 i; for (i = 0; i < rx_ring->count; i++) { - struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i); - if (!rx_bi->addr) + if (!rx_bi) continue; - xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle); - rx_bi->addr = NULL; + xsk_buff_free(rx_bi); + rx_bi = NULL; } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index 9ed59c14eb55..ea919a7d60ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -12,12 +12,13 @@ int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair); int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair); int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, u16 qid); -void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count); int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget); bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi, struct i40e_ring *tx_ring, int napi_budget); int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags); +int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring); +void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring); #endif /* _I40E_XSK_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 4c835c144907..f066befadd39 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019, Intel Corporation. */ +#include <net/xdp_sock_drv.h> #include "ice_base.h" #include "ice_dcb_lib.h" @@ -308,24 +309,23 @@ int ice_setup_rx_ctx(struct ice_ring *ring) if (ring->xsk_umem) { xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); - ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + ring->rx_buf_len = + xsk_umem_get_rx_frame_size(ring->xsk_umem); /* For AF_XDP ZC, we disallow packets to span on * multiple buffers, thus letting us skip that * handling in the fast-path. 
*/ chain_len = 1; - ring->zca.free = ice_zca_free; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &ring->zca); + MEM_TYPE_XSK_BUFF_POOL, + NULL); if (err) return err; + xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); - dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", ring->q_index); } else { - ring->zca.free = NULL; if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, @@ -426,7 +426,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) writel(0, ring->tail); err = ring->xsk_umem ? - ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) : + ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) : ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); if (err) dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 025dd642cf28..5f5a6ce2b7e5 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -158,17 +158,16 @@ struct ice_tx_offload_params { }; struct ice_rx_buf { - struct sk_buff *skb; - dma_addr_t dma; union { struct { + struct sk_buff *skb; + dma_addr_t dma; struct page *page; unsigned int page_offset; u16 pagecnt_bias; }; struct { - void *addr; - u64 handle; + struct xdp_buff *xdp; }; }; }; @@ -292,7 +291,6 @@ struct ice_ring { struct rcu_head rcu; /* to avoid race on free */ struct bpf_prog *xdp_prog; struct xdp_umem *xsk_umem; - struct zero_copy_allocator zca; /* CL3 - 3rd cacheline starts here */ struct xdp_rxq_info xdp_rxq; /* CLX - the below items are only accessed infrequently and should be diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 20ac54e3156d..b6f928c9e9c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -2,7 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. 
*/ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "ice.h" #include "ice_base.h" @@ -280,28 +280,6 @@ static int ice_xsk_alloc_umems(struct ice_vsi *vsi) } /** - * ice_xsk_add_umem - add a UMEM region for XDP sockets - * @vsi: VSI to which the UMEM will be added - * @umem: pointer to a requested UMEM region - * @qid: queue ID - * - * Returns 0 on success, negative on error - */ -static int ice_xsk_add_umem(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) -{ - int err; - - err = ice_xsk_alloc_umems(vsi); - if (err) - return err; - - vsi->xsk_umems[qid] = umem; - vsi->num_xsk_umems_used++; - - return 0; -} - -/** * ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid * @vsi: VSI from which the VSI will be removed * @qid: Ring/qid associated with the UMEM @@ -318,65 +296,6 @@ static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid) } } -/** - * ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets - * @vsi: VSI to map the UMEM region - * @umem: UMEM to map - * - * Returns 0 on success, negative on error - */ -static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem) -{ - struct ice_pf *pf = vsi->back; - struct device *dev; - unsigned int i; - - dev = ice_pf_to_dev(pf); - for (i = 0; i < umem->npgs; i++) { - dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0, - PAGE_SIZE, - DMA_BIDIRECTIONAL, - ICE_RX_DMA_ATTR); - if (dma_mapping_error(dev, dma)) { - dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n", - i); - goto out_unmap; - } - - umem->pages[i].dma = dma; - } - - return 0; - -out_unmap: - for (; i > 0; i--) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); - umem->pages[i].dma = 0; - } - - return -EFAULT; -} - -/** - * ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets - * @vsi: VSI from which the UMEM will be unmapped - * @umem: UMEM to unmap - */ -static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem) -{ - struct ice_pf *pf = vsi->back; - struct device *dev; - unsigned int i; - - dev = ice_pf_to_dev(pf); - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); - - umem->pages[i].dma = 0; - } -} /** * ice_xsk_umem_disable - disable a UMEM region @@ -391,7 +310,7 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid) !vsi->xsk_umems[qid]) return -EINVAL; - ice_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]); + xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR); ice_xsk_remove_umem(vsi, qid); return 0; @@ -408,7 +327,6 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid) static int ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) { - struct xdp_umem_fq_reuse *reuseq; int err; if (vsi->type != ICE_VSI_PF) @@ -419,20 +337,18 @@ ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid) if (qid >= vsi->num_xsk_umems) return -EINVAL; + err = ice_xsk_alloc_umems(vsi); + if (err) + return err; + if (vsi->xsk_umems && vsi->xsk_umems[qid]) return -EBUSY; - reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); - if (!reuseq) - return -ENOMEM; - - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - err = ice_xsk_umem_dma_map(vsi, umem); - if (err) - return err; + vsi->xsk_umems[qid] = umem; + vsi->num_xsk_umems_used++; - err = ice_xsk_add_umem(vsi, umem, qid); + err = xsk_buff_dma_map(vsi->xsk_umems[qid], ice_pf_to_dev(vsi->back), + ICE_RX_DMA_ATTR); if (err) 
return err; @@ -484,137 +400,22 @@ xsk_umem_if_up: } /** - * ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations - * @zca: zero-cpoy allocator - * @handle: Buffer handle - */ -void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle) -{ - struct ice_rx_buf *rx_buf; - struct ice_ring *rx_ring; - struct xdp_umem *umem; - u64 hr, mask; - u16 nta; - - rx_ring = container_of(zca, struct ice_ring, zca); - umem = rx_ring->xsk_umem; - hr = umem->headroom + XDP_PACKET_HEADROOM; - - mask = umem->chunk_mask; - - nta = rx_ring->next_to_alloc; - rx_buf = &rx_ring->rx_buf[nta]; - - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - handle &= mask; - - rx_buf->dma = xdp_umem_get_dma(umem, handle); - rx_buf->dma += hr; - - rx_buf->addr = xdp_umem_get_data(umem, handle); - rx_buf->addr += hr; - - rx_buf->handle = (u64)handle + umem->headroom; -} - -/** - * ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem - * @rx_ring: ring with an xdp_umem bound to it - * @rx_buf: buffer to which xsk page address will be assigned - * - * This function allocates an Rx buffer in the hot path. - * The buffer can come from fill queue or recycle queue. - * - * Returns true if an assignment was successful, false if not. - */ -static __always_inline bool -ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - void *addr = rx_buf->addr; - u64 handle, hr; - - if (addr) { - rx_ring->rx_stats.page_reuse_count++; - return true; - } - - if (!xsk_umem_peek_addr(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - rx_buf->dma = xdp_umem_get_dma(umem, handle); - rx_buf->dma += hr; - - rx_buf->addr = xdp_umem_get_data(umem, handle); - rx_buf->addr += hr; - - rx_buf->handle = handle + umem->headroom; - - xsk_umem_release_addr(umem); - return true; -} - -/** - * ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem - * @rx_ring: ring with an xdp_umem bound to it - * @rx_buf: buffer to which xsk page address will be assigned - * - * This function allocates an Rx buffer in the slow path. - * The buffer can come from fill queue or recycle queue. - * - * Returns true if an assignment was successful, false if not. - */ -static __always_inline bool -ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - u64 handle, headroom; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } - - handle &= umem->chunk_mask; - headroom = umem->headroom + XDP_PACKET_HEADROOM; - - rx_buf->dma = xdp_umem_get_dma(umem, handle); - rx_buf->dma += headroom; - - rx_buf->addr = xdp_umem_get_data(umem, handle); - rx_buf->addr += headroom; - - rx_buf->handle = handle + umem->headroom; - - xsk_umem_release_addr_rq(umem); - return true; -} - -/** * ice_alloc_rx_bufs_zc - allocate a number of Rx buffers * @rx_ring: Rx ring * @count: The number of buffers to allocate - * @alloc: the function pointer to call for allocation * * This function allocates a number of Rx buffers from the fill ring * or the internal recycle mechanism and places them on the Rx ring. * * Returns false if all allocations were successful, true if any fail. 
*/ -static bool -ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, - bool (*alloc)(struct ice_ring *, struct ice_rx_buf *)) +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count) { union ice_32b_rx_flex_desc *rx_desc; u16 ntu = rx_ring->next_to_use; struct ice_rx_buf *rx_buf; bool ret = false; + dma_addr_t dma; if (!count) return false; @@ -623,16 +424,14 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, rx_buf = &rx_ring->rx_buf[ntu]; do { - if (!alloc(rx_ring, rx_buf)) { + rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem); + if (!rx_buf->xdp) { ret = true; break; } - dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0, - rx_ring->rx_buf_len, - DMA_BIDIRECTIONAL); - - rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma); + dma = xsk_buff_xdp_get_dma(rx_buf->xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); rx_desc->wb.status_error0 = 0; rx_desc++; @@ -653,32 +452,6 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count, } /** - * ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path - * @rx_ring: Rx ring - * @count: number of bufs to allocate - * - * Returns false on success, true on failure. - */ -static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count) -{ - return ice_alloc_rx_bufs_zc(rx_ring, count, - ice_alloc_buf_fast_zc); -} - -/** - * ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path - * @rx_ring: Rx ring - * @count: number of bufs to allocate - * - * Returns false on success, true on failure. - */ -bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count) -{ - return ice_alloc_rx_bufs_zc(rx_ring, count, - ice_alloc_buf_slow_zc); -} - -/** * ice_bump_ntc - Bump the next_to_clean counter of an Rx ring * @rx_ring: Rx ring */ @@ -692,76 +465,21 @@ static void ice_bump_ntc(struct ice_ring *rx_ring) } /** - * ice_get_rx_buf_zc - Fetch the current Rx buffer - * @rx_ring: Rx ring - * @size: size of a buffer - * - * This function returns the current, received Rx buffer and does - * DMA synchronization. - * - * Returns a pointer to the received Rx buffer. - */ -static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size) -{ - struct ice_rx_buf *rx_buf; - - rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; - - dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0, - size, DMA_BIDIRECTIONAL); - - return rx_buf; -} - -/** - * ice_reuse_rx_buf_zc - reuse an Rx buffer - * @rx_ring: Rx ring - * @old_buf: The buffer to recycle - * - * This function recycles a finished Rx buffer, and places it on the recycle - * queue (next_to_alloc). - */ -static void -ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) -{ - unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask; - u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - u16 nta = rx_ring->next_to_alloc; - struct ice_rx_buf *new_buf; - - new_buf = &rx_ring->rx_buf[nta++]; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - new_buf->dma = old_buf->dma & mask; - new_buf->dma += hr; - - new_buf->addr = (void *)((unsigned long)old_buf->addr & mask); - new_buf->addr += hr; - - new_buf->handle = old_buf->handle & mask; - new_buf->handle += rx_ring->xsk_umem->headroom; - - old_buf->addr = NULL; -} - -/** * ice_construct_skb_zc - Create an sk_buff from zero-copy buffer * @rx_ring: Rx ring * @rx_buf: zero-copy Rx buffer - * @xdp: XDP buffer * * This function allocates a new skb from a zero-copy Rx buffer. * * Returns the skb on success, NULL on failure. 
*/ static struct sk_buff * -ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, - struct xdp_buff *xdp) +ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) { - unsigned int metasize = xdp->data - xdp->data_meta; - unsigned int datasize = xdp->data_end - xdp->data; - unsigned int datasize_hard = xdp->data_end - - xdp->data_hard_start; + unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta; + unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data; + unsigned int datasize_hard = rx_buf->xdp->data_end - + rx_buf->xdp->data_hard_start; struct sk_buff *skb; skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard, @@ -769,13 +487,13 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, if (unlikely(!skb)) return NULL; - skb_reserve(skb, xdp->data - xdp->data_hard_start); - memcpy(__skb_put(skb, datasize), xdp->data, datasize); + skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize); if (metasize) skb_metadata_set(skb, metasize); - ice_reuse_rx_buf_zc(rx_ring, rx_buf); - + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; return skb; } @@ -802,7 +520,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp) } act = bpf_prog_run_xdp(xdp_prog, xdp); - xdp->handle += xdp->data - xdp->data_hard_start; switch (act) { case XDP_PASS: break; @@ -840,13 +557,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); - struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_xmit = 0; bool failure = false; - struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; @@ -858,8 +570,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) u8 rx_ptype; if (cleaned_count >= ICE_RX_BUF_WRITE) { - failure |= ice_alloc_rx_bufs_fast_zc(rx_ring, - cleaned_count); + failure |= ice_alloc_rx_bufs_zc(rx_ring, + cleaned_count); cleaned_count = 0; } @@ -880,25 +592,19 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) if (!size) break; - rx_buf = ice_get_rx_buf_zc(rx_ring, size); - if (!rx_buf->addr) - break; - xdp.data = rx_buf->addr; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + size; - xdp.handle = rx_buf->handle; + rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + rx_buf->xdp->data_end = rx_buf->xdp->data + size; + xsk_buff_dma_sync_for_cpu(rx_buf->xdp); - xdp_res = ice_run_xdp_zc(rx_ring, &xdp); + xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp); if (xdp_res) { - if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { + if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) xdp_xmit |= xdp_res; - rx_buf->addr = NULL; - } else { - ice_reuse_rx_buf_zc(rx_ring, rx_buf); - } + else + xsk_buff_free(rx_buf->xdp); + rx_buf->xdp = NULL; total_rx_bytes += size; total_rx_packets++; cleaned_count++; @@ -908,7 +614,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) } /* XDP_PASS path */ - skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp); + skb = ice_construct_skb_zc(rx_ring, rx_buf); if (!skb) { rx_ring->rx_stats.alloc_buf_failed++; break; @@ -979,10 +685,9 @@ static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget) if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) break; - dma = 
xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr); - - dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma, + desc.len); tx_buf->bytecount = desc.len; @@ -1165,11 +870,10 @@ void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring) for (i = 0; i < rx_ring->count; i++) { struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; - if (!rx_buf->addr) + if (!rx_buf->xdp) continue; - xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_buf->handle); - rx_buf->addr = NULL; + rx_buf->xdp = NULL; } } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.h b/drivers/net/ethernet/intel/ice/ice_xsk.h index 8a4ba7c6d549..fc1a06b4df36 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.h +++ b/drivers/net/ethernet/intel/ice/ice_xsk.h @@ -10,11 +10,10 @@ struct ice_vsi; #ifdef CONFIG_XDP_SOCKETS int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid); -void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle); int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget); bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget); int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); -bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count); +bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count); bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi); void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring); void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring); @@ -27,12 +26,6 @@ ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi, return -EOPNOTSUPP; } -static inline void -ice_zca_free(struct zero_copy_allocator __always_unused *zca, - unsigned long __always_unused handle) -{ -} - static inline int ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring, int __always_unused budget) @@ -48,8 +41,8 @@ ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring, } static inline bool -ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring, - u16 __always_unused count) +ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring, + u16 __always_unused count) { return false; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 2833e4f041ce..5ddfc83a1e46 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -224,17 +224,17 @@ struct ixgbe_tx_buffer { }; struct ixgbe_rx_buffer { - struct sk_buff *skb; - dma_addr_t dma; union { struct { + struct sk_buff *skb; + dma_addr_t dma; struct page *page; __u32 page_offset; __u16 pagecnt_bias; }; struct { - void *addr; - u64 handle; + bool discard; + struct xdp_buff *xdp; }; }; }; @@ -351,7 +351,6 @@ struct ixgbe_ring { }; struct xdp_rxq_info xdp_rxq; struct xdp_umem *xsk_umem; - struct zero_copy_allocator zca; /* ZC allocator anchor */ u16 ring_idx; /* {rx,tx,xdp}_ring back reference idx */ u16 rx_buf_len; } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index eab5934b04f5..45fc7ce1a543 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -35,7 +35,7 @@ #include <net/tc_act/tc_mirred.h> #include <net/vxlan.h> #include <net/mpls.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xfrm.h> #include "ixgbe.h" @@ -3745,8 +3745,7 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter 
*adapter, /* configure the packet buffer length */ if (rx_ring->xsk_umem) { - u32 xsk_buf_len = rx_ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + u32 xsk_buf_len = xsk_umem_get_rx_frame_size(rx_ring->xsk_umem); /* If the MAC support setting RXDCTL.RLPML, the * SRRCTL[n].BSIZEPKT is set to PAGE_SIZE and @@ -4093,11 +4092,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ring->xsk_umem = ixgbe_xsk_umem(adapter, ring); if (ring->xsk_umem) { - ring->zca.free = ixgbe_zca_free; WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &ring->zca)); - + MEM_TYPE_XSK_BUFF_POOL, + NULL)); + xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq); } else { WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL)); @@ -4153,8 +4151,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter, } if (ring->xsk_umem && hw->mac.type != ixgbe_mac_82599EB) { - u32 xsk_buf_len = ring->xsk_umem->chunk_size_nohr - - XDP_PACKET_HEADROOM; + u32 xsk_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem); rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK | IXGBE_RXDCTL_RLPML_EN); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h index 6d01700b46bc..7887ae4aaf4f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h @@ -35,7 +35,7 @@ int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem, void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); -void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count); +bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count); int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, struct ixgbe_ring *rx_ring, const int budget); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index a656ee9a1fae..86add9fbd36c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -2,7 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. 
*/ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "ixgbe.h" @@ -20,54 +20,11 @@ struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter, return xdp_get_umem_from_qid(adapter->netdev, qid); } -static int ixgbe_xsk_umem_dma_map(struct ixgbe_adapter *adapter, - struct xdp_umem *umem) -{ - struct device *dev = &adapter->pdev->dev; - unsigned int i, j; - dma_addr_t dma; - - for (i = 0; i < umem->npgs; i++) { - dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE, - DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR); - if (dma_mapping_error(dev, dma)) - goto out_unmap; - - umem->pages[i].dma = dma; - } - - return 0; - -out_unmap: - for (j = 0; j < i; j++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR); - umem->pages[i].dma = 0; - } - - return -1; -} - -static void ixgbe_xsk_umem_dma_unmap(struct ixgbe_adapter *adapter, - struct xdp_umem *umem) -{ - struct device *dev = &adapter->pdev->dev; - unsigned int i; - - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR); - - umem->pages[i].dma = 0; - } -} - static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter, struct xdp_umem *umem, u16 qid) { struct net_device *netdev = adapter->netdev; - struct xdp_umem_fq_reuse *reuseq; bool if_running; int err; @@ -78,13 +35,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter, qid >= netdev->real_num_tx_queues) return -EINVAL; - reuseq = xsk_reuseq_prepare(adapter->rx_ring[0]->count); - if (!reuseq) - return -ENOMEM; - - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - err = ixgbe_xsk_umem_dma_map(adapter, umem); + err = xsk_buff_dma_map(umem, &adapter->pdev->dev, IXGBE_RX_DMA_ATTR); if (err) return err; @@ -124,7 +75,7 @@ static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid) ixgbe_txrx_ring_disable(adapter, qid); clear_bit(qid, adapter->af_xdp_zc_qps); - ixgbe_xsk_umem_dma_unmap(adapter, umem); + xsk_buff_dma_unmap(umem, IXGBE_RX_DMA_ATTR); if (if_running) ixgbe_txrx_ring_enable(adapter, qid); @@ -143,19 +94,14 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, struct ixgbe_ring *rx_ring, struct xdp_buff *xdp) { - struct xdp_umem *umem = rx_ring->xsk_umem; int err, result = IXGBE_XDP_PASS; struct bpf_prog *xdp_prog; struct xdp_frame *xdpf; - u64 offset; u32 act; rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); act = bpf_prog_run_xdp(xdp_prog, xdp); - offset = xdp->data - xdp->data_hard_start; - - xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset); switch (act) { case XDP_PASS: @@ -186,140 +132,16 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, return result; } -static struct -ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring, - unsigned int size) -{ - struct ixgbe_rx_buffer *bi; - - bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - bi->dma, 0, - size, - DMA_BIDIRECTIONAL); - - return bi; -} - -static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *obi) -{ - u16 nta = rx_ring->next_to_alloc; - struct ixgbe_rx_buffer *nbi; - - nbi = &rx_ring->rx_buffer_info[rx_ring->next_to_alloc]; - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? 
nta : 0; - - /* transfer page from old buffer to new buffer */ - nbi->dma = obi->dma; - nbi->addr = obi->addr; - nbi->handle = obi->handle; - - obi->addr = NULL; - obi->skb = NULL; -} - -void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle) -{ - struct ixgbe_rx_buffer *bi; - struct ixgbe_ring *rx_ring; - u64 hr, mask; - u16 nta; - - rx_ring = container_of(alloc, struct ixgbe_ring, zca); - hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM; - mask = rx_ring->xsk_umem->chunk_mask; - - nta = rx_ring->next_to_alloc; - bi = rx_ring->rx_buffer_info; - - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - handle &= mask; - - bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle, - rx_ring->xsk_umem->headroom); -} - -static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - void *addr = bi->addr; - u64 handle, hr; - - if (addr) - return true; - - if (!xsk_umem_peek_addr(umem, &handle)) { - rx_ring->rx_stats.alloc_rx_page_failed++; - return false; - } - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr(umem); - return true; -} - -static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi) -{ - struct xdp_umem *umem = rx_ring->xsk_umem; - u64 handle, hr; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) { - rx_ring->rx_stats.alloc_rx_page_failed++; - return false; - } - - handle &= rx_ring->xsk_umem->chunk_mask; - - hr = umem->headroom + XDP_PACKET_HEADROOM; - - bi->dma = xdp_umem_get_dma(umem, handle); - bi->dma += hr; - - bi->addr = xdp_umem_get_data(umem, handle); - bi->addr += hr; - - bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom); - - xsk_umem_release_addr_rq(umem); - return true; -} - -static __always_inline bool -__ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, - bool alloc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi)) +bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count) { union ixgbe_adv_rx_desc *rx_desc; struct ixgbe_rx_buffer *bi; u16 i = rx_ring->next_to_use; + dma_addr_t dma; bool ok = true; /* nothing to do */ - if (!cleaned_count) + if (!count) return true; rx_desc = IXGBE_RX_DESC(rx_ring, i); @@ -327,21 +149,18 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, i -= rx_ring->count; do { - if (!alloc(rx_ring, bi)) { + bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem); + if (!bi->xdp) { ok = false; break; } - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, - rx_ring->rx_buf_len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_xdp_get_dma(bi->xdp); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. 
*/ - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma); + rx_desc->read.pkt_addr = cpu_to_le64(dma); rx_desc++; bi++; @@ -355,17 +174,14 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, /* clear the length for the next_to_use descriptor */ rx_desc->wb.upper.length = 0; - cleaned_count--; - } while (cleaned_count); + count--; + } while (count); i += rx_ring->count; if (rx_ring->next_to_use != i) { rx_ring->next_to_use = i; - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = i; - /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -378,40 +194,27 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count, return ok; } -void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count) -{ - __ixgbe_alloc_rx_buffers_zc(rx_ring, count, - ixgbe_alloc_buffer_slow_zc); -} - -static bool ixgbe_alloc_rx_buffers_fast_zc(struct ixgbe_ring *rx_ring, - u16 count) -{ - return __ixgbe_alloc_rx_buffers_zc(rx_ring, count, - ixgbe_alloc_buffer_zc); -} - static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring, - struct ixgbe_rx_buffer *bi, - struct xdp_buff *xdp) + struct ixgbe_rx_buffer *bi) { - unsigned int metasize = xdp->data - xdp->data_meta; - unsigned int datasize = xdp->data_end - xdp->data; + unsigned int metasize = bi->xdp->data - bi->xdp->data_meta; + unsigned int datasize = bi->xdp->data_end - bi->xdp->data; struct sk_buff *skb; /* allocate a skb to store the frags */ skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - xdp->data_end - xdp->data_hard_start, + bi->xdp->data_end - bi->xdp->data_hard_start, GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; - skb_reserve(skb, xdp->data - xdp->data_hard_start); - memcpy(__skb_put(skb, datasize), xdp->data, datasize); + skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start); + memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize); if (metasize) skb_metadata_set(skb, metasize); - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); + xsk_buff_free(bi->xdp); + bi->xdp = NULL; return skb; } @@ -431,14 +234,9 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbe_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbe_desc_unused(rx_ring); - struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; - struct xdp_buff xdp; - - xdp.rxq = &rx_ring->xdp_rxq; - xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; @@ -448,8 +246,8 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, /* return some buffers to hardware, one at a time is too slow */ if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) { failure = failure || - !ixgbe_alloc_rx_buffers_fast_zc(rx_ring, - cleaned_count); + !ixgbe_alloc_rx_buffers_zc(rx_ring, + cleaned_count); cleaned_count = 0; } @@ -464,42 +262,40 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, */ dma_rmb(); - bi = ixgbe_get_rx_buffer_zc(rx_ring, size); + bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; if (unlikely(!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) { struct ixgbe_rx_buffer *next_bi; - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); + xsk_buff_free(bi->xdp); + bi->xdp = NULL; ixgbe_inc_ntc(rx_ring); next_bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; - next_bi->skb = ERR_PTR(-EINVAL); + 
next_bi->discard = true; continue; } - if (unlikely(bi->skb)) { - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); + if (unlikely(bi->discard)) { + xsk_buff_free(bi->xdp); + bi->xdp = NULL; + bi->discard = false; ixgbe_inc_ntc(rx_ring); continue; } - xdp.data = bi->addr; - xdp.data_meta = xdp.data; - xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM; - xdp.data_end = xdp.data + size; - xdp.handle = bi->handle; - - xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, &xdp); + bi->xdp->data_end = bi->xdp->data + size; + xsk_buff_dma_sync_for_cpu(bi->xdp); + xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp); if (xdp_res) { - if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) { + if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) xdp_xmit |= xdp_res; - bi->addr = NULL; - bi->skb = NULL; - } else { - ixgbe_reuse_rx_buffer_zc(rx_ring, bi); - } + else + xsk_buff_free(bi->xdp); + + bi->xdp = NULL; total_rx_packets++; total_rx_bytes += size; @@ -509,7 +305,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, } /* XDP_PASS path */ - skb = ixgbe_construct_skb_zc(rx_ring, bi, &xdp); + skb = ixgbe_construct_skb_zc(rx_ring, bi); if (!skb) { rx_ring->rx_stats.alloc_rx_buff_failed++; break; @@ -561,17 +357,17 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring) { - u16 i = rx_ring->next_to_clean; - struct ixgbe_rx_buffer *bi = &rx_ring->rx_buffer_info[i]; + struct ixgbe_rx_buffer *bi; + u16 i; - while (i != rx_ring->next_to_alloc) { - xsk_umem_fq_reuse(rx_ring->xsk_umem, bi->handle); - i++; - bi++; - if (i == rx_ring->count) { - i = 0; - bi = rx_ring->rx_buffer_info; - } + for (i = 0; i < rx_ring->count; i++) { + bi = &rx_ring->rx_buffer_info[i]; + + if (!bi->xdp) + continue; + + xsk_buff_free(bi->xdp); + bi->xdp = NULL; } } @@ -594,10 +390,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc)) break; - dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr); - - dma_sync_single_for_device(xdp_ring->dev, dma, desc.len, - DMA_BIDIRECTIONAL); + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma, + desc.len); tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use]; tx_bi->bytecount = desc.len; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 81fd53569463..4906aee6798d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -365,10 +365,7 @@ struct mlx5e_dma_info { dma_addr_t addr; union { struct page *page; - struct { - u64 handle; - void *data; - } xsk; + struct xdp_buff *xsk; }; }; @@ -581,7 +578,6 @@ struct mlx5e_rq { } mpwqe; }; struct { - u16 umem_headroom; u16 headroom; u32 frame0_sz; u8 map_dir; /* dma map direction */ @@ -614,7 +610,6 @@ struct mlx5e_rq { struct page_pool *page_pool; /* AF_XDP zero-copy */ - struct zero_copy_allocator zca; struct xdp_umem *umem; struct work_struct recover_work; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index eb2e1f2138e4..38e4f19d69f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -12,15 +12,16 @@ static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params, u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) { - u16 headroom = NET_IP_ALIGN; + u16 headroom; 
- if (mlx5e_rx_is_xdp(params, xsk)) { + if (xsk) + return xsk->headroom; + + headroom = NET_IP_ALIGN; + if (mlx5e_rx_is_xdp(params, xsk)) headroom += XDP_PACKET_HEADROOM; - if (xsk) - headroom += xsk->headroom; - } else { + else headroom += MLX5_RX_HEADROOM; - } return headroom; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 42202d19245c..3bea1d4be53b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -31,7 +31,7 @@ */ #include <linux/bpf_trace.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "en/xdp.h" #include "en/params.h" @@ -71,7 +71,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, xdptxd.data = xdpf->data; xdptxd.len = xdpf->len; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) { + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { /* The xdp_buff was in the UMEM and was copied into a newly * allocated page. The UMEM page was returned via the ZCA, and * this new page has to be mapped at this point and has to be @@ -119,50 +119,33 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, /* returns true if packet was consumed by xdp */ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, - void *va, u16 *rx_headroom, u32 *len, bool xsk) + u32 *len, struct xdp_buff *xdp) { struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); - struct xdp_umem *umem = rq->umem; - struct xdp_buff xdp; u32 act; int err; if (!prog) return false; - xdp.data = va + *rx_headroom; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = xdp.data + *len; - xdp.data_hard_start = va; - if (xsk) - xdp.handle = di->xsk.handle; - xdp.rxq = &rq->xdp_rxq; - xdp.frame_sz = rq->buff.frame0_sz; - - act = bpf_prog_run_xdp(prog, &xdp); - if (xsk) { - u64 off = xdp.data - xdp.data_hard_start; - - xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off); - } + act = bpf_prog_run_xdp(prog, xdp); switch (act) { case XDP_PASS: - *rx_headroom = xdp.data - xdp.data_hard_start; - *len = xdp.data_end - xdp.data; + *len = xdp->data_end - xdp->data; return false; case XDP_TX: - if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp))) + if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp))) goto xdp_abort; __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ return true; case XDP_REDIRECT: /* When XDP enabled then page-refcnt==1 here */ - err = xdp_do_redirect(rq->netdev, &xdp, prog); + err = xdp_do_redirect(rq->netdev, xdp, prog); if (unlikely(err)) goto xdp_abort; __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); - if (!xsk) + if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) mlx5e_page_dma_unmap(rq, di); rq->stats->xdp_redirect++; return true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index be64eb68f4e5..ca48c293151b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -61,7 +61,7 @@ struct mlx5e_xsk_param; int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, - void *va, u16 *rx_headroom, u32 *len, bool xsk); + u32 *len, struct xdp_buff *xdp); void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq); bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c index 62fc8a128a8d..a33a1f762c70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -3,71 +3,10 @@ #include "rx.h" #include "en/xdp.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* RX data path */ -bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count) -{ - /* Check in advance that we have enough frames, instead of allocating - * one-by-one, failing and moving frames to the Reuse Ring. - */ - return xsk_umem_has_addrs_rq(rq->umem, count); -} - -int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info) -{ - struct xdp_umem *umem = rq->umem; - u64 handle; - - if (!xsk_umem_peek_addr_rq(umem, &handle)) - return -ENOMEM; - - dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle, - rq->buff.umem_headroom); - dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle); - - /* No need to add headroom to the DMA address. In striding RQ case, we - * just provide pages for UMR, and headroom is counted at the setup - * stage when creating a WQE. In non-striding RQ case, headroom is - * accounted in mlx5e_alloc_rx_wqe. - */ - dma_info->addr = xdp_umem_get_dma(umem, handle); - - xsk_umem_release_addr_rq(umem); - - dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE, - DMA_BIDIRECTIONAL); - - return 0; -} - -static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle) -{ - xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask); -} - -/* XSKRQ uses pages from UMEM, they must not be released. They are returned to - * the userspace if possible, and if not, this function is called to reuse them - * in the driver. - */ -void mlx5e_xsk_page_release(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info) -{ - mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle); -} - -/* Return a frame back to the hardware to fill in again. It is used by XDP when - * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP. - */ -void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle) -{ - struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca); - - mlx5e_xsk_recycle_frame(rq, handle); -} - static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data, u32 cqe_bcnt) { @@ -90,11 +29,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, u32 head_offset, u32 page_idx) { - struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; - u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom; + struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk; u32 cqe_bcnt32 = cqe_bcnt; - void *va, *data; - u32 frag_size; bool consumed; /* Check packet size. Note LRO doesn't use linear SKB */ @@ -103,22 +39,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, return NULL; } - /* head_offset is not used in this function, because di->xsk.data and - * di->addr point directly to the necessary place. Furthermore, in the - * current implementation, UMR pages are mapped to XSK frames, so + /* head_offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, in + * the current implementation, UMR pages are mapped to XSK frames, so * head_offset should always be 0. 
*/ WARN_ON_ONCE(head_offset); - va = di->xsk.data; - data = va + rx_headroom; - frag_size = rq->buff.headroom + cqe_bcnt32; - - dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL); - prefetch(data); + xdp->data_end = xdp->data + cqe_bcnt32; + xdp_set_data_meta_invalid(xdp); + xsk_buff_dma_sync_for_cpu(xdp); + prefetch(xdp->data); rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true); + consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp); rcu_read_unlock(); /* Possible flows: @@ -145,7 +79,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the * frame. On SKB allocation failure, NULL is returned. */ - return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32); + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32); } struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, @@ -153,25 +87,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) { - struct mlx5e_dma_info *di = wi->di; - u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom; - void *va, *data; + struct xdp_buff *xdp = wi->di->xsk; bool consumed; - u32 frag_size; - /* wi->offset is not used in this function, because di->xsk.data and - * di->addr point directly to the necessary place. Furthermore, in the - * current implementation, one page = one packet = one frame, so + /* wi->offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, the + * XSK allocator allocates frames per packet, instead of pages, so * wi->offset should always be 0. */ WARN_ON_ONCE(wi->offset); - va = di->xsk.data; - data = va + rx_headroom; - frag_size = rq->buff.headroom + cqe_bcnt; - - dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL); - prefetch(data); + xdp->data_end = xdp->data + cqe_bcnt; + xdp_set_data_meta_invalid(xdp); + xsk_buff_dma_sync_for_cpu(xdp); + prefetch(xdp->data); if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) { rq->stats->wqe_err++; @@ -179,7 +108,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, } rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true); + consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp); rcu_read_unlock(); if (likely(consumed)) @@ -189,5 +118,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, * will be handled by mlx5e_put_rx_frag. * On SKB allocation failure, NULL is returned. 
*/ - return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt); + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h index cab0e93497ae..d147b2f13b54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -5,16 +5,10 @@ #define __MLX5_EN_XSK_RX_H__ #include "en.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* RX data path */ -bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count); -int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info); -void mlx5e_xsk_page_release(struct mlx5e_rq *rq, - struct mlx5e_dma_info *dma_info); -void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle); struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, u16 cqe_bcnt, @@ -25,6 +19,23 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt); +static inline int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + dma_info->xsk = xsk_buff_alloc(rq->umem); + if (!dma_info->xsk) + return -ENOMEM; + + /* Store the DMA address without headroom. In striding RQ case, we just + * provide pages for UMR, and headroom is counted at the setup stage + * when creating a WQE. In non-striding RQ case, headroom is accounted + * in mlx5e_alloc_rx_wqe. + */ + dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk); + + return 0; +} + static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err) { if (!xsk_umem_uses_need_wakeup(rq->umem)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index 3bcdb5b2fc20..83dce9cdb8c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -5,7 +5,7 @@ #include "umem.h" #include "en/xdp.h" #include "en/params.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) { @@ -92,12 +92,11 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) break; } - xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr); - xdptxd.data = xdp_umem_get_data(umem, desc.addr); + xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr); + xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr); xdptxd.len = desc.len; - dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr, - xdptxd.len, DMA_BIDIRECTIONAL); + xsk_buff_raw_dma_sync_for_device(umem, xdptxd.dma_addr, xdptxd.len); if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) { if (sq->mpwqe.wqe) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h index 79b487d89757..39fa0a705856 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h @@ -5,7 +5,7 @@ #define __MLX5_EN_XSK_TX_H__ #include "en.h" -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> /* TX data path */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c index 4baaa5788320..7b17fcd0a56d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: 
GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2019 Mellanox Technologies. */ -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "umem.h" #include "setup.h" #include "en/params.h" @@ -10,40 +10,14 @@ static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv, struct xdp_umem *umem) { struct device *dev = priv->mdev->device; - u32 i; - for (i = 0; i < umem->npgs; i++) { - dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE, - DMA_BIDIRECTIONAL); - - if (unlikely(dma_mapping_error(dev, dma))) - goto err_unmap; - umem->pages[i].dma = dma; - } - - return 0; - -err_unmap: - while (i--) { - dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL); - umem->pages[i].dma = 0; - } - - return -ENOMEM; + return xsk_buff_dma_map(umem, dev, 0); } static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv, struct xdp_umem *umem) { - struct device *dev = priv->mdev->device; - u32 i; - - for (i = 0; i < umem->npgs; i++) { - dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE, - DMA_BIDIRECTIONAL); - umem->pages[i].dma = 0; - } + return xsk_buff_dma_unmap(umem, 0); } static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk) @@ -90,13 +64,14 @@ static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix) static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem) { - return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff; + return xsk_umem_get_headroom(umem) <= 0xffff && + xsk_umem_get_chunk_size(umem) <= 0xffff; } void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk) { - xsk->headroom = umem->headroom; - xsk->chunk_size = umem->chunk_size_nohr + umem->headroom; + xsk->headroom = xsk_umem_get_headroom(umem); + xsk->chunk_size = xsk_umem_get_chunk_size(umem); } static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv, @@ -241,18 +216,6 @@ int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid) mlx5e_xsk_disable_umem(priv, ix); } -int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries) -{ - struct xdp_umem_fq_reuse *reuseq; - - reuseq = xsk_reuseq_prepare(nentries); - if (unlikely(!reuseq)) - return -ENOMEM; - xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); - - return 0; -} - u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk) { u16 res = xsk->refcnt ? params->num_channels : 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07823abe5557..8b86c6ded302 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -38,7 +38,7 @@ #include <linux/bpf.h> #include <linux/if_bridge.h> #include <net/page_pool.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "eswitch.h" #include "en.h" #include "en/txrx.h" @@ -373,7 +373,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->mdev; void *rqc = rqp->rqc; void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); - u32 num_xsk_frames = 0; u32 rq_xdp_ix; u32 pool_size; int wq_sz; @@ -413,7 +412,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk); - rq->buff.umem_headroom = xsk ? 
xsk->headroom : 0; pool_size = 1 << params->log_rq_mtu_frames; switch (rq->wq_type) { @@ -427,10 +425,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); - if (xsk) - num_xsk_frames = wq_sz << - mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk); - pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params, xsk); @@ -482,9 +476,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq); - if (xsk) - num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags; - rq->wqe.info = rqp->frags_info; rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; @@ -525,19 +516,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, } if (xsk) { - rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem); - - err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames); - if (unlikely(err)) { - mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n", - num_xsk_frames); - goto err_free; - } - - rq->zca.free = mlx5e_xsk_zca_free; err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, - MEM_TYPE_ZERO_COPY, - &rq->zca); + MEM_TYPE_XSK_BUFF_POOL, NULL); + xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq); } else { /* Create a page_pool and register it with rxq */ pp_params.order = 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index a514685fb560..fdba52136c5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -300,7 +300,7 @@ static inline void mlx5e_page_release(struct mlx5e_rq *rq, * put into the Reuse Ring, because there is no way to return * the page to the userspace when the interface goes down. */ - mlx5e_xsk_page_release(rq, dma_info); + xsk_buff_free(dma_info->xsk); else mlx5e_page_release_dynamic(rq, dma_info, recycle); } @@ -385,7 +385,11 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk) if (rq->umem) { int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags; - if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired))) + /* Check in advance that we have enough frames, instead of + * allocating one-by-one, failing and moving frames to the + * Reuse Ring. + */ + if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired))) return -ENOMEM; } @@ -480,8 +484,11 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) int err; int i; + /* Check in advance that we have enough frames, instead of allocating + * one-by-one, failing and moving frames to the Reuse Ring. 
+ */ if (rq->umem && - unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) { + unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) { err = -ENOMEM; goto err; } @@ -1044,12 +1051,24 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, return skb; } +static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, + u32 len, struct xdp_buff *xdp) +{ + xdp->data_hard_start = va; + xdp_set_data_meta_invalid(xdp); + xdp->data = va + headroom; + xdp->data_end = xdp->data + len; + xdp->rxq = &rq->xdp_rxq; + xdp->frame_sz = rq->buff.frame0_sz; +} + struct sk_buff * mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) { struct mlx5e_dma_info *di = wi->di; u16 rx_headroom = rq->buff.headroom; + struct xdp_buff xdp; struct sk_buff *skb; void *va, *data; bool consumed; @@ -1065,11 +1084,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, prefetch(data); rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false); + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp); + consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp); rcu_read_unlock(); if (consumed) return NULL; /* page/packet was consumed by XDP */ + rx_headroom = xdp.data - xdp.data_hard_start; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); if (unlikely(!skb)) @@ -1343,6 +1364,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; u16 rx_headroom = rq->buff.headroom; u32 cqe_bcnt32 = cqe_bcnt; + struct xdp_buff xdp; struct sk_buff *skb; void *va, *data; u32 frag_size; @@ -1364,7 +1386,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, prefetch(data); rcu_read_lock(); - consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false); + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp); + consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp); rcu_read_unlock(); if (consumed) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) @@ -1372,6 +1395,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return NULL; /* page/packet was consumed by XDP */ } + rx_headroom = xdp.data - xdp.data_hard_start; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); if (unlikely(!skb)) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 1e0c024b0a93..8e4141552423 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; xdp->frame_sz = PAGE_SIZE; - xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git 
a/include/net/xdp.h b/include/net/xdp.h index 3094fccf5a88..90f11760bd12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -39,7 +39,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, - MEM_TYPE_ZERO_COPY, + MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -54,10 +54,6 @@ struct xdp_mem_info { struct page_pool; -struct zero_copy_allocator { - void (*free)(struct zero_copy_allocator *zca, unsigned long handle); -}; - struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -70,7 +66,6 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; - unsigned long handle; struct xdp_rxq_info *rxq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; @@ -119,7 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index abd72de25fa4..96bfc5f5f24e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -15,40 +15,15 @@ struct net_device; struct xsk_queue; - -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) - -struct xdp_umem_page { - void *addr; - dma_addr_t dma; -}; - -struct xdp_umem_fq_reuse { - u32 nentries; - u32 length; - u64 handles[]; -}; - -/* Flags for the umem flags field. - * - * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public - * flags. See inlude/uapi/include/linux/if_xdp.h. - */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) +struct xdp_buff; struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct xdp_umem_page *pages; - u64 chunk_mask; + struct xsk_buff_pool *pool; u64 size; u32 headroom; - u32 chunk_size_nohr; + u32 chunk_size; struct user_struct *user; refcount_t users; struct work_struct work; @@ -59,28 +34,17 @@ struct xdp_umem { u8 flags; int id; struct net_device *dev; - struct xdp_umem_fq_reuse *fq_reuse; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; }; -/* Nodes are linked in the struct xdp_sock map_list field, and used to - * track which maps a certain socket reside in. 
- */ - struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ struct xdp_sock *xsk_map[]; }; -struct xsk_map_node { - struct list_head node; - struct xsk_map *map; - struct xdp_sock **map_entry; -}; - struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; @@ -111,32 +75,9 @@ struct xdp_sock { spinlock_t map_list_lock; }; -struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS -int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); -/* Used from netdev driver */ -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, - struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); @@ -153,230 +94,13 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return xs; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); -} - -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline 
void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr + umem->headroom; -} - #else + static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { return -ENOTSUPP; } -static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return false; -} - -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) -{ -} - -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) -{ - return false; -} - -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) -{ -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - return NULL; -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ -} - -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) -{ - return NULL; -} - -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return 0; -} - -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) -{ -} - -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) -{ - return false; -} - -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; @@ -391,6 +115,7 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, { return NULL; } + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git 
a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h new file mode 100644 index 000000000000..ccf848f7efa4 --- /dev/null +++ b/include/net/xdp_sock_drv.h @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Interface for implementing AF_XDP zero-copy support in drivers. + * Copyright(c) 2020 Intel Corporation. + */ + +#ifndef _LINUX_XDP_SOCK_DRV_H +#define _LINUX_XDP_SOCK_DRV_H + +#include <net/xdp_sock.h> +#include <net/xsk_buff_pool.h> + +#ifdef CONFIG_XDP_SOCKETS + +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +void xsk_set_rx_need_wakeup(struct xdp_umem *umem); +void xsk_set_tx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); +void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); +bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); + +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return XDP_PACKET_HEADROOM + umem->headroom; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return umem->chunk_size; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ + xp_set_rxq_info(umem->pool, rxq); +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ + xp_dma_unmap(umem->pool, attrs); +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_dma(xskb); +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_frame_dma(xskb); +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return xp_alloc(umem->pool); +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return xp_can_alloc(umem->pool, count); +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_free(xskb); +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_dma(umem->pool, addr); +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_data(umem->pool, addr); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_dma_sync_for_cpu(xskb); +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ + xp_dma_sync_for_device(umem->pool, dma, size); +} + +#else + +static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ +} + +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, + struct xdp_desc *desc) +{ + return false; +} + +static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ +} + +static inline struct xdp_umem 
*xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + return NULL; +} + +static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +{ +} + +static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +{ + return false; +} + +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return NULL; +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return false; +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ +} + +#endif /* CONFIG_XDP_SOCKETS */ + +#endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h new file mode 100644 index 000000000000..a4ff226505c9 --- /dev/null +++ b/include/net/xsk_buff_pool.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef XSK_BUFF_POOL_H_ +#define XSK_BUFF_POOL_H_ + +#include <linux/if_xdp.h> +#include <linux/types.h> +#include <linux/dma-mapping.h> +#include <net/xdp.h> + +struct xsk_buff_pool; +struct xdp_rxq_info; +struct xsk_queue; +struct xdp_desc; +struct device; +struct page; + +struct xdp_buff_xsk { + struct xdp_buff xdp; + dma_addr_t dma; + dma_addr_t frame_dma; + struct xsk_buff_pool *pool; + bool unaligned; + u64 orig_addr; + struct list_head free_list_node; +}; + +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + +/* AF_XDP core. */ +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned); +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +void xp_destroy(struct xsk_buff_pool *pool); +void xp_release(struct xdp_buff_xsk *xskb); + +/* AF_XDP, and XDP core. 
*/ +void xp_free(struct xdp_buff_xsk *xskb); + +/* AF_XDP ZC drivers, via xdp_sock_buff.h */ +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages); +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); +static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} + +static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); +static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + xp_dma_sync_for_cpu_slow(xskb); +} + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); +static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, + dma_addr_t dma, size_t size) +{ + if (pool->cheap_dma) + return; + + xp_dma_sync_for_device_slow(pool, dma, size); +} + +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static inline u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} + +#endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b95d65e8c628..b73d3e141323 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,7 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) + FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 37b2d8620153..375b933010dd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -ifeq ($(CONFIG_XDP_SOCKETS),y) -obj-$(CONFIG_BPF_SYSCALL) += xskmap.o -endif 
obj-$(CONFIG_BPF_SYSCALL) += offload.o endif ifeq ($(CONFIG_PERF_EVENTS),y) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25b14ee0e26d..d2e27dba4ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool reg_type_not_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_MAP_VALUE || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_BTF_ID; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || @@ -6308,8 +6317,25 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, bool is_jmp32) { - if (__is_pointer_value(false, reg)) - return -1; + if (__is_pointer_value(false, reg)) { + if (!reg_type_not_null(reg->type)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } if (is_jmp32) return is_branch32_taken(reg, val, opcode); @@ -6808,7 +6834,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (pred >= 0) { - err = mark_chain_precision(env, insn->dst_reg); + /* If we get here with a dst_reg pointer type it is because + * above is_branch_taken() special cased the 0 comparison. 
+ */ + if (!__is_pointer_value(false, dst_reg)) + err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) err = mark_chain_precision(env, insn->src_reg); if (err) @@ -7094,7 +7124,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: @@ -7120,10 +7154,11 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_TRACE_FEXIT: range = tnum_const(0); break; - case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: return 0; + case BPF_TRACE_ITER: + break; default: return -ENOTSUPP; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 30ba7d38941d..bfd4ccd80847 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + u32 user_size = kattr->test.data_size_in; void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); + if (user_size > size) + return ERR_PTR(-EMSGSIZE); + data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); - if (copy_from_user(data + headroom, data_in, size)) { + if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } @@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; - if (size > max_data_sz) - return -EINVAL; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/core/xdp.c b/net/core/xdp.c index 490b8f5fa8ee..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> +#include <net/xdp_sock_drv.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -109,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - 
mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -143,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -301,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -358,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -383,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! 
*/ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -493,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 3aa4975919d7..9ef54cdcf662 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "netlink.h" #include "common.h" diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index eeb1137a3f23..31e0b4e88a9d 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -24,7 +24,7 @@ #include <linux/sched/signal.h> #include <linux/net.h> #include <net/devlink.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> #include <generated/utsrelease.h> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c35a8b2e0499..02aa5cb3a4fd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -756,12 +756,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -782,6 +781,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b69496eaf922..0625a97a8894 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -505,9 +505,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -532,9 +531,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? 
BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 71e2bdafb2ce..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 37ace3bc0d48..19e59d1a5e9f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - xsk_reuseq_destroy(umem); - - xdp_umem_unmap_pages(umem); + xp_destroy(umem->pool); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -385,11 +349,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->chunk_mask = unaligned_chunks ? 
XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; @@ -407,18 +369,13 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { err = -ENOMEM; goto out_pin; } - - err = xdp_umem_map_pages(umem); - if (!err) - return 0; - - kvfree(umem->pages); + return 0; out_pin: xdp_umem_unpin_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index a63a9fb251f5..32067fe98f65 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,7 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45ffd67b367d..b6c0f08bd80d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,7 +22,7 @@ #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -117,76 +99,82 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. 
- */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +void xp_release(struct xdp_buff_xsk *xskb) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} - return; - } +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - memcpy(to_buf, from_buf, len + metalen); + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; int err; - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { xs->rx_dropped++; - return -ENOSPC; + return err; } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; - metalen = 0; - } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; - } + xp_release(xskb); + return 0; +} - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +{ + void *from_buf, *to_buf; + u32 metalen; - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; + metalen = 0; + } else { + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +187,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool 
explicit_free) { u32 len; @@ -211,8 +200,9 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +214,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +228,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -404,7 +359,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -629,24 +584,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -729,23 +666,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. 
*/ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: @@ -860,6 +788,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 4cfd106bdb53..455ddd480f3d 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,20 @@ #ifndef XSK_H_ #define XSK_H_ +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) + struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; @@ -17,9 +31,25 @@ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 cr; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + #endif /* XSK_H_ */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..540ed75e4482 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <net/xsk_buff_pool.h> +#include <net/xdp_sock.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/swiotlb.h> + +#include "xsk_queue.h" + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + 
pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= 
pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? 
xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) +{ + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 57fb81bd593c..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,18 +6,10 @@ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> +#include <net/xdp_sock_drv.h> #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) -{ - if (!q) - return; - - q->umem_size = umem_size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -63,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 648733ec24ac..5b5d24d2dd37 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,9 @@ #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> +#include <net/xsk_buff_pool.h> + +#include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -29,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -103,98 +104,73 @@ struct xsk_queue { /* Functions that read and validate content from 
consumer rings. */ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->umem_size || addr >= q->umem_size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; + *addr = ring->desc[idx]; + return true; } - return true; + return false; } -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { - if (addr >= q->umem_size) { - q->invalid_descs++; + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) return false; - } + if (desc->options) + return false; return true; } -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; + u64 addr, base_addr; - *addr = ring->desc[idx] & q->chunk_mask; + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } + if (desc->len > pool->chunk_size) + return false; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; -out: - q->cached_cons++; - } + if (desc->options) + return false; + return true; +} - return false; +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? 
xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; - - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } - - return true; - } - - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; - - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -250,12 +226,11 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); + return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, @@ -379,11 +354,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ diff --git a/kernel/bpf/xskmap.c b/net/xdp/xskmap.c index 2cc5c8f4c800..1dc7208c71ba 100644 --- a/kernel/bpf/xskmap.c +++ b/net/xdp/xskmap.c @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/sched.h> +#include "xsk.h" + int xsk_map_inc(struct xsk_map *map) { bpf_map_inc(&map->map); diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 23837f2ed458..034800c4d1e6 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -50,3 +50,4 @@ xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +testfile.img diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 424f6fe7ce38..8403e4762306 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -63,14 +63,14 @@ TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o fds_example-objs := fds_example.o sockex1-objs := sockex1_user.o sockex2-objs := sockex2_user.o -sockex3-objs := bpf_load.o sockex3_user.o -tracex1-objs := bpf_load.o tracex1_user.o $(TRACE_HELPERS) -tracex2-objs := bpf_load.o tracex2_user.o -tracex3-objs := bpf_load.o tracex3_user.o -tracex4-objs := bpf_load.o tracex4_user.o -tracex5-objs := bpf_load.o tracex5_user.o $(TRACE_HELPERS) -tracex6-objs := bpf_load.o tracex6_user.o -tracex7-objs := bpf_load.o tracex7_user.o +sockex3-objs := sockex3_user.o +tracex1-objs := tracex1_user.o $(TRACE_HELPERS) +tracex2-objs := tracex2_user.o +tracex3-objs := tracex3_user.o +tracex4-objs := tracex4_user.o +tracex5-objs := tracex5_user.o $(TRACE_HELPERS) +tracex6-objs := tracex6_user.o +tracex7-objs := tracex7_user.o test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) lathist-objs := bpf_load.o lathist_user.o diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c index e504dc308371..f24806ac24e7 100644 --- a/samples/bpf/sampleip_kern.c +++ b/samples/bpf/sampleip_kern.c @@ 
-13,12 +13,12 @@ #define MAX_IPS 8192 -struct bpf_map_def SEC("maps") ip_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(u64), - .value_size = sizeof(u32), - .max_entries = MAX_IPS, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, u32); + __uint(max_entries, MAX_IPS); +} ip_map SEC(".maps"); SEC("perf_event") int do_sample(struct bpf_perf_event_data *ctx) diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 4372d2da2f9e..921c505bb567 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -18,9 +18,6 @@ #include "perf-sys.h" #include "trace_helpers.h" -#define __must_check -#include <linux/err.h> - #define DEFAULT_FREQ 99 #define DEFAULT_SECS 5 #define MAX_IPS 8192 @@ -57,7 +54,7 @@ static int sampling_start(int freq, struct bpf_program *prog, return 1; } links[i] = bpf_program__attach_perf_event(prog, pmu_fd); - if (IS_ERR(links[i])) { + if (libbpf_get_error(links[i])) { fprintf(stderr, "ERROR: Attach perf event\n"); links[i] = NULL; close(pmu_fd); @@ -182,7 +179,7 @@ int main(int argc, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); - if (IS_ERR(obj)) { + if (libbpf_get_error(obj)) { fprintf(stderr, "ERROR: opening BPF object file failed\n"); obj = NULL; goto cleanup; diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 779a5249c418..cab9cca0b8eb 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -19,12 +19,12 @@ #define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") jmp_table = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), - .max_entries = 8, -}; +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 8); +} jmp_table SEC(".maps"); #define PARSE_VLAN 1 #define PARSE_MPLS 2 @@ -92,12 +92,12 @@ struct globals { struct flow_key_record flow; }; -struct bpf_map_def SEC("maps") percpu_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(__u32), - .value_size = sizeof(struct globals), - .max_entries = 32, -}; +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct globals); + __uint(max_entries, 32); +} percpu_map SEC(".maps"); /* user poor man's per_cpu until native support is ready */ static struct globals *this_cpu_globals(void) @@ -113,12 +113,12 @@ struct pair { __u64 bytes; }; -struct bpf_map_def SEC("maps") hash_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct flow_key_record), - .value_size = sizeof(struct pair), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct flow_key_record); + __type(value, struct pair); + __uint(max_entries, 1024); +} hash_map SEC(".maps"); static void update_stats(struct __sk_buff *skb, struct globals *g) { diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index bbb1cd0666a9..4dbee7427d47 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -1,18 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <assert.h> -#include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> #include <sys/resource.h> -#define PARSE_IP 3 -#define PARSE_IP_PROG_FD (prog_fd[0]) -#define PROG_ARRAY_FD (map_fd[0]) - struct flow_key_record { __be32 src; __be32 
dst; @@ -30,31 +25,55 @@ struct pair { int main(int argc, char **argv) { + int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + const char *title; FILE *f; - int i, sock, err, id, key = PARSE_IP; - struct bpf_prog_info info = {}; - uint32_t info_len = sizeof(info); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table"); + hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); + if (jmp_table_fd < 0 || hash_map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; } - /* Test fd array lookup which returns the id of the bpf_prog */ - err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len); - assert(!err); - err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id); - assert(!err); - assert(id == info.id); + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + + title = bpf_program__title(prog, false); + if (sscanf(title, "socket/%d", &key) != 1) { + fprintf(stderr, "ERROR: finding prog failed\n"); + goto cleanup; + } + + if (key == 0) + main_prog_fd = fd; + else + bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY); + } sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4], + /* attach BPF program to socket */ + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd, sizeof(__u32)) == 0); if (argc > 1) @@ -69,8 +88,8 @@ int main(int argc, char **argv) sleep(1); printf("IP src.port -> dst.port bytes packets\n"); - while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[2], &next_key, &value); + while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(hash_map_fd, &next_key, &value); printf("%s.%05d -> %s.%05d %12lld %12lld\n", inet_ntoa((struct in_addr){htonl(next_key.src)}), next_key.port16[0], @@ -80,5 +99,8 @@ int main(int argc, char **argv) key = next_key; } } + +cleanup: + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h new file mode 100644 index 000000000000..8cb5400aed1f --- /dev/null +++ b/samples/bpf/trace_common.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_COMMON_H +#define __TRACE_COMMON_H + +#ifdef __x86_64__ +#define SYSCALL(SYS) "__x64_" __stringify(SYS) +#elif defined(__s390x__) +#define SYSCALL(SYS) "__s390x_" __stringify(SYS) +#else +#define SYSCALL(SYS) __stringify(SYS) +#endif + +#endif diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index da1d69e20645..7d3c66fb3f88 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -18,19 +18,19 @@ struct key_t { u32 userstack; }; -struct bpf_map_def SEC("maps") counts = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct key_t), - .value_size = sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, 
10000); +} counts SEC(".maps"); -struct bpf_map_def SEC("maps") stackmap = { - .type = BPF_MAP_TYPE_STACK_TRACE, - .key_size = sizeof(u32), - .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), - .max_entries = 10000, -}; +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(u32)); + __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); + __uint(max_entries, 10000); +} stackmap SEC(".maps"); #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index b6cd358d0418..ac1ba368195c 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -16,9 +16,6 @@ #include "perf-sys.h" #include "trace_helpers.h" -#define __must_check -#include <linux/err.h> - #define SAMPLE_FREQ 50 static int pid; @@ -159,7 +156,7 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr) goto all_cpu_err; } links[i] = bpf_program__attach_perf_event(prog, pmu_fd); - if (IS_ERR(links[i])) { + if (libbpf_get_error(links[i])) { printf("bpf_program__attach_perf_event failed\n"); links[i] = NULL; close(pmu_fd); @@ -198,7 +195,7 @@ static void test_perf_event_task(struct perf_event_attr *attr) goto err; } link = bpf_program__attach_perf_event(prog, pmu_fd); - if (IS_ERR(link)) { + if (libbpf_get_error(link)) { printf("bpf_program__attach_perf_event failed\n"); link = NULL; close(pmu_fd); @@ -314,7 +311,7 @@ int main(int argc, char **argv) } obj = bpf_object__open_file(filename, NULL); - if (IS_ERR(obj)) { + if (libbpf_get_error(obj)) { printf("opening BPF object file failed\n"); obj = NULL; goto cleanup; diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c index 55fddbd08702..9d4adb7fd834 100644 --- a/samples/bpf/tracex1_user.c +++ b/samples/bpf/tracex1_user.c @@ -1,21 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "trace_helpers.h" int main(int ac, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; } f = popen("taskset 1 ping -c5 localhost", "r"); @@ -23,5 +43,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index d865bb309bcb..5bc696bac27d 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -10,13 +10,14 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include "trace_common.h" -struct bpf_map_def SEC("maps") my_map = { - .type = 
BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, long); + __uint(max_entries, 1024); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -70,14 +71,14 @@ struct hist_key { u64 index; }; -struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_PERCPU_HASH, - .key_size = sizeof(struct hist_key), - .value_size = sizeof(long), - .max_entries = 1024, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(struct hist_key)); + __uint(value_size, sizeof(long)); + __uint(max_entries, 1024); +} my_hist_map SEC(".maps"); -SEC("kprobe/sys_write") +SEC("kprobe/" SYSCALL(sys_write)) int bpf_prog3(struct pt_regs *ctx) { long write_size = PT_REGS_PARM3(ctx); diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index c9544a4ce61a..3e36b3e4e3ef 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -3,17 +3,19 @@ #include <unistd.h> #include <stdlib.h> #include <signal.h> -#include <linux/bpf.h> #include <string.h> #include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" #define MAX_INDEX 64 #define MAX_STARS 38 +/* my_map, my_hist_map */ +static int map_fd[2]; + static void stars(char *str, long val, long max, int width) { int i; @@ -115,18 +117,39 @@ static void int_exit(int sig) int main(int ac, char **argv) { struct rlimit r = {1024*1024, RLIM_INFINITY}; - char filename[256]; long key, next_key, value; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; + char filename[256]; + int i, j = 0; FILE *f; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK)"); return 1; } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map"); + if (map_fd[0] < 0 || map_fd[1] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + signal(SIGINT, int_exit); signal(SIGTERM, int_exit); @@ -138,9 +161,14 @@ int main(int ac, char **argv) f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); (void) f; - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; i < 5; i++) { @@ -156,5 +184,10 @@ int main(int ac, char **argv) } print_hist(map_fd[1]); +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index fe21c14feb8d..659613c19a82 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -11,12 +11,12 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> -struct bpf_map_def SEC("maps") my_map = 
{ - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(u64), - .max_entries = 4096, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, long); + __type(value, u64); + __uint(max_entries, 4096); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful @@ -42,12 +42,12 @@ static unsigned int log2l(unsigned long long n) #define SLOTS 100 -struct bpf_map_def SEC("maps") lat_map = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u64), - .max_entries = SLOTS, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, SLOTS); +} lat_map SEC(".maps"); SEC("kprobe/blk_account_io_completion") int bpf_prog2(struct pt_regs *ctx) diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index cf8fedc773f2..70e987775c15 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -7,11 +7,10 @@ #include <unistd.h> #include <stdbool.h> #include <string.h> -#include <linux/bpf.h> #include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include "bpf_util.h" #define SLOTS 100 @@ -109,20 +108,11 @@ static void print_hist(int fd) int main(int ac, char **argv) { struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (setrlimit(RLIMIT_MEMLOCK, &r)) { - perror("setrlimit(RLIMIT_MEMLOCK)"); - return 1; - } - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } + int map_fd, i, j = 0; for (i = 1; i < ac; i++) { if (strcmp(argv[i], "-a") == 0) { @@ -137,6 +127,40 @@ int main(int ac, char **argv) } } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; + } + printf(" heatmap of IO latency\n"); if (text_only) printf(" %s", sym[num_colors - 1]); @@ -153,9 +177,14 @@ int main(int ac, char **argv) for (i = 0; ; i++) { if (i % 20 == 0) print_banner(); - print_hist(map_fd[1]); + print_hist(map_fd); sleep(2); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c index b1bb9df88f8e..eb0f8fdd14bf 100644 --- a/samples/bpf/tracex4_kern.c +++ b/samples/bpf/tracex4_kern.c @@ -15,12 +15,12 @@ struct pair { u64 ip; }; -struct bpf_map_def SEC("maps") my_map = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(long), - .value_size = sizeof(struct pair), - .max_entries = 1000000, -}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + 
__type(key, long); + __type(value, struct pair); + __uint(max_entries, 1000000); +} my_map SEC(".maps"); /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index ec52203fce39..e8faf8f184ae 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -8,11 +8,10 @@ #include <stdbool.h> #include <string.h> #include <time.h> -#include <linux/bpf.h> #include <sys/resource.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> struct pair { long long val; @@ -36,8 +35,8 @@ static void print_old_objects(int fd) key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ key = -1; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &v); + while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(fd, &next_key, &v); key = next_key; if (val - v.val < 1000000000ll) /* object was allocated more then 1 sec ago */ @@ -50,25 +49,55 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; - int i; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + int map_fd, i, j = 0; if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); return 1; } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); + if (map_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[j] = bpf_program__attach(prog); + if (libbpf_get_error(links[j])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[j] = NULL; + goto cleanup; + } + j++; } for (i = 0; ; i++) { - print_old_objects(map_fd[1]); + print_old_objects(map_fd); sleep(1); } +cleanup: + for (j--; j >= 0; j--) + bpf_link__destroy(links[j]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index 481790fde864..32b49e8ab6bd 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -15,16 +15,16 @@ #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F -struct bpf_map_def SEC("maps") progs = { - .type = BPF_MAP_TYPE_PROG_ARRAY, - .key_size = sizeof(u32), - .value_size = sizeof(u32), +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); #ifdef __mips__ - .max_entries = 6000, /* MIPS n64 syscalls start at 5000 */ + __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */ #else - .max_entries = 1024, + __uint(max_entries, 1024); #endif -}; +} progs SEC(".maps"); SEC("kprobe/__seccomp_filter") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index c2317b39e0d2..98dad57a96c4 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -1,15 +1,21 @@ // SPDX-License-Identifier: 
GPL-2.0 #include <stdio.h> -#include <linux/bpf.h> +#include <stdlib.h> #include <unistd.h> #include <linux/filter.h> #include <linux/seccomp.h> #include <sys/prctl.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> #include <sys/resource.h> #include "trace_helpers.h" +#ifdef __mips__ +#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */ +#else +#define MAX_ENTRIES 1024 +#endif + /* install fake seccomp program to enable seccomp code path inside the kernel, * so that our kprobe attached to seccomp_phase1() can be triggered */ @@ -28,16 +34,57 @@ static void install_accept_all_seccomp(void) int main(int ac, char **argv) { - FILE *f; - char filename[256]; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; + int key, fd, progs_fd; + char filename[256]; + const char *title; + FILE *f; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + printf("finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; + goto cleanup; + } + + progs_fd = bpf_object__find_map_fd_by_name(obj, "progs"); + if (progs_fd < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + title = bpf_program__title(prog, false); + /* register only syscalls to PROG_ARRAY */ + if (sscanf(title, "kprobe/%d", &key) != 1) + continue; + + fd = bpf_program__fd(prog); + bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY); } install_accept_all_seccomp(); @@ -47,5 +94,8 @@ int main(int ac, char **argv) read_trace_pipe(); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index 96c234efa852..acad5712d8b4 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -3,24 +3,26 @@ #include <uapi/linux/bpf.h> #include <bpf/bpf_helpers.h> -struct bpf_map_def SEC("maps") counters = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(u64), - .max_entries = 64, -}; -struct bpf_map_def SEC("maps") values2 = { - .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(struct bpf_perf_event_value), - .max_entries = 64, -}; +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(u32)); + __uint(max_entries, 64); +} counters SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, u64); + __uint(max_entries, 64); +} values SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct bpf_perf_event_value); + 
__uint(max_entries, 64); +} values2 SEC(".maps"); SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index 4bb3c830adb2..33df9784775d 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -4,7 +4,6 @@ #include <assert.h> #include <fcntl.h> #include <linux/perf_event.h> -#include <linux/bpf.h> #include <sched.h> #include <stdio.h> #include <stdlib.h> @@ -15,12 +14,15 @@ #include <sys/wait.h> #include <unistd.h> -#include "bpf_load.h" #include <bpf/bpf.h> +#include <bpf/libbpf.h> #include "perf-sys.h" #define SAMPLE_PERIOD 0x7fffffffffffffffULL +/* counters, values, values2 */ +static int map_fd[3]; + static void check_on_cpu(int cpu, struct perf_event_attr *attr) { struct bpf_perf_event_value value2; @@ -174,16 +176,51 @@ static void test_bpf_perf_event(void) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_link *links[2]; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; + int i = 0; + + setrlimit(RLIMIT_MEMLOCK, &r); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } - setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } + + map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters"); + map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values"); + map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2"); + if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { + fprintf(stderr, "ERROR: finding a map in obj file failed\n"); + goto cleanup; + } + + bpf_object__for_each_program(prog, obj) { + links[i] = bpf_program__attach(prog); + if (libbpf_get_error(links[i])) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + links[i] = NULL; + goto cleanup; + } + i++; } test_bpf_perf_event(); + +cleanup: + for (i--; i >= 0; i--) + bpf_link__destroy(links[i]); + + bpf_object__close(obj); return 0; } diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c index ea6dae78f0df..fdcd6580dd73 100644 --- a/samples/bpf/tracex7_user.c +++ b/samples/bpf/tracex7_user.c @@ -1,28 +1,51 @@ #define _GNU_SOURCE #include <stdio.h> -#include <linux/bpf.h> #include <unistd.h> -#include <bpf/bpf.h> -#include "bpf_load.h" +#include <bpf/libbpf.h> int main(int argc, char **argv) { - FILE *f; + struct bpf_link *link = NULL; + struct bpf_program *prog; + struct bpf_object *obj; char filename[256]; char command[256]; - int ret; + int ret = 0; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) { + fprintf(stderr, "ERROR: opening BPF object file failed\n"); + return 0; + } + + prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); + if (!prog) { + fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); + goto cleanup; + } + + /* load BPF program */ + if (bpf_object__load(obj)) { + fprintf(stderr, "ERROR: loading BPF object file failed\n"); + goto cleanup; + } - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; + link = bpf_program__attach(prog); + if (libbpf_get_error(link)) { + fprintf(stderr, "ERROR: bpf_program__attach failed\n"); + link = NULL; 
+ goto cleanup; } snprintf(command, 256, "mount %s tmpmnt/", argv[1]); f = popen(command, "r"); ret = pclose(f); +cleanup: + bpf_link__destroy(link); + bpf_object__close(obj); return ret ? 0 : 1; } diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 9b8f21abeac4..f3468168982e 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -19,9 +19,6 @@ static const char *__doc__ = #include <time.h> #include <linux/limits.h> -#define __must_check -#include <linux/err.h> - #include <arpa/inet.h> #include <linux/if_link.h> @@ -622,7 +619,7 @@ static struct bpf_link * attach_tp(struct bpf_object *obj, } link = bpf_program__attach_tracepoint(prog, tp_category, tp_name); - if (IS_ERR(link)) + if (libbpf_get_error(link)) exit(EXIT_FAIL_BPF); return link; diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index e4d9da654e84..a226aee3574f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -29,8 +29,8 @@ CGROUP COMMANDS | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } | *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** | | **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** | -| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | -| **getsockopt** | **setsockopt** } +| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** | +| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** } | *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION @@ -101,7 +101,11 @@ DESCRIPTION an unconnected udp6 socket (since 5.2); **sysctl** sysctl access (since 5.2); **getsockopt** call to getsockopt (since 5.3); - **setsockopt** call to setsockopt (since 5.3). + **setsockopt** call to setsockopt (since 5.3); + **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); + **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); + **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); + **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). 
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* Detach *PROG* from the cgroup *CGROUP* and attach type diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 5948e9d89c8d..2b254959d488 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -41,7 +41,8 @@ PROG COMMANDS | **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | | **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | | **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | -| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | | **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | | **cgroup/getsockopt** | **cgroup/setsockopt** | | **struct_ops** | **fentry** | **fexit** | **freplace** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 9f0f20e73b87..25b25aca1112 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -472,6 +472,8 @@ _bpftool() lwt_seg6local sockops sk_skb sk_msg \ lirc_mode2 cgroup/bind4 cgroup/bind6 \ cgroup/connect4 cgroup/connect6 \ + cgroup/getpeername4 cgroup/getpeername6 \ + cgroup/getsockname4 cgroup/getsockname6 \ cgroup/sendmsg4 cgroup/sendmsg6 \ cgroup/recvmsg4 cgroup/recvmsg6 \ cgroup/post_bind4 cgroup/post_bind6 \ @@ -966,9 +968,10 @@ _bpftool() ;; attach|detach) local ATTACH_TYPES='ingress egress sock_create sock_ops \ - device bind4 bind6 post_bind4 post_bind6 connect4 \ - connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \ - getsockopt setsockopt' + device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ + getpeername4 getpeername6 getsockname4 getsockname6 \ + sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ + setsockopt' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' case $prev in @@ -977,9 +980,9 @@ _bpftool() return 0 ;; ingress|egress|sock_create|sock_ops|device|bind4|bind6|\ - post_bind4|post_bind6|connect4|connect6|sendmsg4|\ - sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\ - setsockopt) + post_bind4|post_bind6|connect4|connect6|getpeername4|\ + getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\ + recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt) COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ "$cur" ) ) return 0 diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 1693c802bb20..27931db421d8 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -25,9 +25,10 @@ " ATTACH_TYPE := { ingress | egress | sock_create |\n" \ " sock_ops | device | bind4 | bind6 |\n" \ " post_bind4 | post_bind6 | connect4 |\n" \ - " connect6 | sendmsg4 | sendmsg6 |\n" \ - " recvmsg4 | recvmsg6 | sysctl |\n" \ - " getsockopt | setsockopt }" + " connect6 | getpeername4 | getpeername6 |\n" \ + " getsockname4 | getsockname6 | sendmsg4 |\n" \ + " sendmsg6 | recvmsg4 | recvmsg6 |\n" \ + " sysctl | getsockopt | setsockopt }" static unsigned int query_flags; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index f89ac70ef973..5cdf0bc049bd 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -100,6 +100,10 @@ static const char * const 
attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_CGROUP_INET6_CONNECT] = "connect6", [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4", + [BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6", + [BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4", + [BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6", [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", [BPF_CGROUP_SYSCTL] = "sysctl", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b6e5ba568f98..245f941fdbcf 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2012,8 +2012,10 @@ static int do_help(int argc, char **argv) " sk_reuseport | flow_dissector | cgroup/sysctl |\n" " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" - " cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n" - " cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n" + " cgroup/getpeername4 | cgroup/getpeername6 |\n" + " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n" + " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" + " cgroup/getsockopt | cgroup/setsockopt |\n" " struct_ops | fentry | fexit | freplace }\n" " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..97e1fd19ff58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; @@ -2015,8 +2019,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers diff --git a/tools/lib/bpf/hashmap.c b/tools/lib/bpf/hashmap.c index cffb96202e0d..a405dad068f5 100644 --- a/tools/lib/bpf/hashmap.c +++ b/tools/lib/bpf/hashmap.c @@ -60,7 +60,7 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, void hashmap__clear(struct hashmap *map) { struct hashmap_entry *cur, *tmp; - int bkt; + size_t bkt; hashmap__for_each_entry_safe(map, cur, tmp, bkt) { free(cur); @@ -100,8 +100,7 @@ static int hashmap_grow(struct hashmap *map) struct hashmap_entry **new_buckets; struct hashmap_entry *cur, *tmp; size_t new_cap_bits, new_cap; - size_t h; - int bkt; + size_t h, bkt; new_cap_bits = map->cap_bits + 1; if (new_cap_bits < HASHMAP_MIN_CAP_BITS) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index bae8879cdf58..e823b35e7371 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -15,7 +15,6 @@ #else #include <bits/reg.h> #endif -#include "libbpf_internal.h" static inline size_t hash_bits(size_t h, int bits) { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 292257995487..fa04cbe547ed 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6705,6 +6705,14 @@ static const struct bpf_sec_def section_defs[] = { BPF_CGROUP_UDP4_RECVMSG), BPF_EAPROG_SEC("cgroup/recvmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG), + BPF_EAPROG_SEC("cgroup/getpeername4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_GETPEERNAME), + BPF_EAPROG_SEC("cgroup/getpeername6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_GETPEERNAME), + BPF_EAPROG_SEC("cgroup/getsockname4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET4_GETSOCKNAME), + BPF_EAPROG_SEC("cgroup/getsockname6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_CGROUP_INET6_GETSOCKNAME), BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_CGROUP_SYSCTL), BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT, diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 0f67f1b470b0..e885d351595f 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -1,6 +1,8 @@ ================== BPF Selftest Notes ================== +General instructions on running selftests can be found in +`Documentation/bpf/bpf_devel_QA.rst`_. Additional information about selftest failures are documented here. 
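(Illustrative aside, not part of this patch, on the bpf_xdp_adjust_tail() helper documentation updated in tools/include/uapi/linux/bpf.h a few hunks above: the text now also covers growing the packet tail; the sketch below only uses the long-supported negative delta to shrink a frame, with made-up sizes.)

/* Sketch: trim 4 bytes off the tail of frames longer than 64 bytes. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_trim_tail(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;

	if (data + 64 > data_end)
		return XDP_PASS;		/* too short, leave untouched */
	if (bpf_xdp_adjust_tail(ctx, -4))	/* negative delta shrinks */
		return XDP_ABORTED;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";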
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 60e3ae5d4e48..2118e23ac07a 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y CONFIG_FTRACE_SYSCALLS=y CONFIG_IPV6_TUNNEL=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_SEG6_BPF=y CONFIG_NET_FOU=m CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_IPV6_FOU=m @@ -37,3 +38,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_LIRC=y diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 999a775484c1..e36dd1a1780d 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -5,6 +5,8 @@ #include <string.h> #include <unistd.h> +#include <arpa/inet.h> + #include <sys/epoll.h> #include <linux/err.h> @@ -35,7 +37,7 @@ struct ipv6_packet pkt_v6 = { .tcp.doff = 5, }; -int start_server(int family, int type) +int start_server_with_port(int family, int type, __u16 port) { struct sockaddr_storage addr = {}; socklen_t len; @@ -45,11 +47,13 @@ int start_server(int family, int type) struct sockaddr_in *sin = (void *)&addr; sin->sin_family = AF_INET; + sin->sin_port = htons(port); len = sizeof(*sin); } else { struct sockaddr_in6 *sin6 = (void *)&addr; sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); len = sizeof(*sin6); } @@ -76,6 +80,11 @@ int start_server(int family, int type) return fd; } +int start_server(int family, int type) +{ + return start_server_with_port(family, type, 0); +} + static const struct timeval timeo_sec = { .tv_sec = 3 }; static const size_t timeo_optlen = sizeof(timeo_sec); diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 86914e6e7b53..6a8009605670 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -34,6 +34,7 @@ struct ipv6_packet { extern struct ipv6_packet pkt_v6; int start_server(int family, int type); +int start_server_with_port(int family, int type, __u16 port); int connect_to_fd(int family, int type, int server_fd); int connect_fd_to_fd(int client_fd, int server_fd); int connect_wait(int client_fd); diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 0262f7b374f9..c548aded6585 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -1,24 +1,5 @@ -#include <asm/types.h> -#include <linux/types.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <stddef.h> -#include <stdbool.h> - -#include <linux/unistd.h> -#include <linux/filter.h> -#include <linux/bpf_perf_event.h> -#include <linux/bpf.h> - -#include <bpf/bpf.h> - -#include "../../../include/linux/filter.h" -#include "bpf_rlimit.h" -#include "bpf_util.h" +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> #define MAX_INSNS 512 #define MAX_MATCHES 16 @@ -359,15 +340,15 @@ static struct bpf_align_test tests[] = { * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. 
Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, - {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"}, + {33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"}, }, }, { @@ -410,15 +391,15 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ @@ -426,15 +407,15 @@ static struct bpf_align_test tests[] = { /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. */ - {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"}, + {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, }, }, { @@ -469,16 +450,16 @@ static struct bpf_align_test tests[] = { .matches = { {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, + {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. 
*/ - {7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, + {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -486,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, } }, { @@ -528,7 +509,7 @@ static struct bpf_align_test tests[] = { /* New unknown value in R7 is (4n) */ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"}, + {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* At the time the word size load is performed from R5, @@ -537,7 +518,8 @@ static struct bpf_align_test tests[] = { * the total offset is 4-byte aligned and meets the * load's requirements. */ - {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"}, + }, }, { @@ -579,18 +561,18 @@ static struct bpf_align_test tests[] = { /* Adding 14 makes R6 be (4n+2) */ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"}, + {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so * the total offset is 4-byte aligned and meets the * load's requirements. 
*/ - {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"}, + {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, }, }, }; @@ -669,51 +651,16 @@ static int do_test_single(struct bpf_align_test *test) return ret; } -static int do_test(unsigned int from, unsigned int to) +void test_align(void) { - int all_pass = 0; - int all_fail = 0; unsigned int i; - for (i = from; i < to; i++) { + for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_align_test *test = &tests[i]; - int fail; - printf("Test %3d: %s ... ", - i, test->descr); - fail = do_test_single(test); - if (fail) { - all_fail++; - printf("FAIL\n"); - } else { - all_pass++; - printf("PASS\n"); - } - } - printf("Results: %d pass %d fail\n", - all_pass, all_fail); - return all_fail ? EXIT_FAILURE : EXIT_SUCCESS; -} + if (!test__start_subtest(test->descr)) + continue; -int main(int argc, char **argv) -{ - unsigned int from = 0, to = ARRAY_SIZE(tests); - - if (argc == 3) { - unsigned int l = atoi(argv[argc - 2]); - unsigned int u = atoi(argv[argc - 1]); - - if (l < to && u < to) { - from = l; - to = u + 1; - } - } else if (argc == 2) { - unsigned int t = atoi(argv[argc - 1]); - - if (t < to) { - from = t; - to = t + 1; - } + CHECK_FAIL(do_test_single(test)); } - return do_test(from, to); } diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c index 47fbb20cb6a6..17bbf76812ca 100644 --- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -4,7 +4,8 @@ #include "cgroup_helpers.h" #include "network_helpers.h" -static int verify_port(int family, int fd, int expected) +static int verify_ports(int family, int fd, + __u16 expected_local, __u16 expected_peer) { struct sockaddr_storage addr; socklen_t len = sizeof(addr); @@ -20,9 +21,25 @@ static int verify_port(int family, int fd, int expected) else port = ((struct sockaddr_in6 *)&addr)->sin6_port; - if (ntohs(port) != expected) { - log_err("Unexpected port %d, expected %d", ntohs(port), - expected); + if (ntohs(port) != expected_local) { + log_err("Unexpected local port %d, expected %d", ntohs(port), + expected_local); + return -1; + } + + if (getpeername(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get peer addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected_peer) { + log_err("Unexpected peer port %d, expected %d", ntohs(port), + expected_peer); return -1; } @@ -31,33 +48,67 @@ static int verify_port(int family, int fd, int expected) static int run_test(int cgroup_fd, int server_fd, int family, int type) { + bool v4 = family == AF_INET; + __u16 expected_local_port = v4 ? 22222 : 22223; + __u16 expected_peer_port = 60000; struct bpf_prog_load_attr attr = { - .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + .file = v4 ? 
"./connect_force_port4.o" : + "./connect_force_port6.o", }; + struct bpf_program *prog; struct bpf_object *obj; - int expected_port; - int prog_fd; - int err; - int fd; - - if (family == AF_INET) { - attr.file = "./connect_force_port4.o"; - attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; - expected_port = 22222; - } else { - attr.file = "./connect_force_port6.o"; - attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; - expected_port = 22223; - } + int xlate_fd, fd, err; + __u32 duration = 0; - err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd); if (err) { log_err("Failed to load BPF object"); return -1; } - err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, - 0); + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/connect4" : + "cgroup/connect6"); + if (CHECK(!prog, "find_prog", "connect prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_CONNECT : + BPF_CGROUP_INET6_CONNECT, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getpeername4" : + "cgroup/getpeername6"); + if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? + BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET6_GETPEERNAME, 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + prog = bpf_object__find_program_by_title(obj, v4 ? + "cgroup/getsockname4" : + "cgroup/getsockname6"); + if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) { + err = -EIO; + goto close_bpf_object; + } + + err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ? 
+ BPF_CGROUP_INET4_GETSOCKNAME : + BPF_CGROUP_INET6_GETSOCKNAME, 0); if (err) { log_err("Failed to attach BPF program"); goto close_bpf_object; @@ -69,8 +120,8 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type) goto close_bpf_object; } - err = verify_port(family, fd, expected_port); - + err = verify_ports(family, fd, expected_local_port, + expected_peer_port); close(fd); close_bpf_object: @@ -86,25 +137,25 @@ void test_connect_force_port(void) if (CHECK_FAIL(cgroup_fd < 0)) return; - server_fd = start_server(AF_INET, SOCK_STREAM); + server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); close(server_fd); - server_fd = start_server(AF_INET6, SOCK_STREAM); + server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); close(server_fd); - server_fd = start_server(AF_INET, SOCK_DGRAM); + server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); close(server_fd); - server_fd = start_server(AF_INET6, SOCK_DGRAM); + server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124); if (CHECK_FAIL(server_fd < 0)) goto close_cgroup_fd; CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 4867cd3445c8..b57bd6fef208 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -1,11 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + SEC("iter/bpf_map") int dump_bpf_map(struct bpf_iter__bpf_map *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c index ab9e2650e021..c8e9ca74c87b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c @@ -1,9 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__ipv6_route #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__ipv6_route { + struct bpf_iter_meta *meta; + struct fib6_info *rt; +} __attribute__((preserve_access_index)); + char _license[] SEC("license") = "GPL"; extern bool 
CONFIG_IPV6_SUBTREES __kconfig __weak; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c index 6b40a233d4e0..e7b8753eac0b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -1,6 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__netlink bpf_iter__netlink___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__netlink #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> @@ -9,6 +14,17 @@ char _license[] SEC("license") = "GPL"; #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__netlink { + struct bpf_iter_meta *meta; + struct netlink_sock *sk; +} __attribute__((preserve_access_index)); + static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c index 90f9011c57ca..ee754021f98e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c @@ -1,11 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index c6ced38f0880..0f0ec3db20ba 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -1,11 +1,29 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task_file bpf_iter__task_file___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task_file #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task_file { + struct bpf_iter_meta *meta; + struct task_struct *task; + __u32 fd; + struct file *file; +} __attribute__((preserve_access_index)); + SEC("iter/task_file") int dump_task_file(struct bpf_iter__task_file *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c index 636a00fa074d..13c2c90c835f 100644 --- 
a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c @@ -1,10 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c index b18dc0471d07..0aa71b333cf3 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c @@ -1,10 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__bpf_map #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +} __attribute__((preserve_access_index)); + __u32 map1_id = 0, map2_id = 0; __u32 map1_accessed = 0, map2_accessed = 0; __u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h index bdd51cf14b54..dee1339e6905 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h @@ -1,11 +1,27 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2020 Facebook */ +/* "undefine" structs in vmlinux.h, because we "override" them below */ +#define bpf_iter_meta bpf_iter_meta___not_used +#define bpf_iter__task bpf_iter__task___not_used #include "vmlinux.h" +#undef bpf_iter_meta +#undef bpf_iter__task #include <bpf/bpf_helpers.h> char _license[] SEC("license") = "GPL"; int count = 0; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +} __attribute__((preserve_access_index)); + +struct bpf_iter__task { + struct bpf_iter_meta *meta; + struct task_struct *task; +} __attribute__((preserve_access_index)); + SEC("iter/task") int dump_task(struct bpf_iter__task *ctx) { diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c index 1b8eb34b2db0..7396308677a3 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port4.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <string.h> +#include <stdbool.h> #include <linux/bpf.h> #include <linux/in.h> @@ -12,17 +13,71 @@ char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; +struct svc_addr { + __be32 addr; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} 
service_mapping SEC(".maps"); + SEC("cgroup/connect4") -int _connect4(struct bpf_sock_addr *ctx) +int connect4(struct bpf_sock_addr *ctx) { struct sockaddr_in sa = {}; + struct svc_addr *orig; + /* Force local address to 127.0.0.1:22222. */ sa.sin_family = AF_INET; sa.sin_port = bpf_htons(22222); - sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) return 0; + /* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr = ctx->user_ip4; + orig->port = ctx->user_port; + + ctx->user_ip4 = bpf_htonl(0x7f000001); + ctx->user_port = bpf_htons(60123); + } + return 1; +} + +SEC("cgroup/getsockname4") +int getsockname4(struct bpf_sock_addr *ctx) +{ + /* Expose local server as 1.2.3.4:60000 to client. */ + if (ctx->user_port == bpf_htons(60123)) { + ctx->user_ip4 = bpf_htonl(0x01020304); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername4") +int getpeername4(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service 1.2.3.4:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60123)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip4 = orig->addr; + ctx->user_port = orig->port; + } + } return 1; } diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c index ae6f7d750b4c..c1a2b555e9ad 100644 --- a/tools/testing/selftests/bpf/progs/connect_force_port6.c +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -12,17 +12,83 @@ char _license[] SEC("license") = "GPL"; int _version SEC("version") = 1; +struct svc_addr { + __be32 addr[4]; + __be16 port; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct svc_addr); +} service_mapping SEC(".maps"); + SEC("cgroup/connect6") -int _connect6(struct bpf_sock_addr *ctx) +int connect6(struct bpf_sock_addr *ctx) { struct sockaddr_in6 sa = {}; + struct svc_addr *orig; + /* Force local address to [::1]:22223. */ sa.sin6_family = AF_INET6; sa.sin6_port = bpf_htons(22223); - sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) return 0; + /* Rewire service [fc00::1]:60000 to backend [::1]:60124. */ + if (ctx->user_port == bpf_htons(60000)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!orig) + return 0; + + orig->addr[0] = ctx->user_ip6[0]; + orig->addr[1] = ctx->user_ip6[1]; + orig->addr[2] = ctx->user_ip6[2]; + orig->addr[3] = ctx->user_ip6[3]; + orig->port = ctx->user_port; + + ctx->user_ip6[0] = 0; + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60124); + } + return 1; +} + +SEC("cgroup/getsockname6") +int getsockname6(struct bpf_sock_addr *ctx) +{ + /* Expose local server as [fc00::1]:60000 to client. 
*/ + if (ctx->user_port == bpf_htons(60124)) { + ctx->user_ip6[0] = bpf_htonl(0xfc000000); + ctx->user_ip6[1] = 0; + ctx->user_ip6[2] = 0; + ctx->user_ip6[3] = bpf_htonl(1); + ctx->user_port = bpf_htons(60000); + } + return 1; +} + +SEC("cgroup/getpeername6") +int getpeername6(struct bpf_sock_addr *ctx) +{ + struct svc_addr *orig; + + /* Expose service [fc00::1]:60000 as peer instead of backend. */ + if (ctx->user_port == bpf_htons(60124)) { + orig = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0); + if (orig) { + ctx->user_ip6[0] = orig->addr[0]; + ctx->user_ip6[1] = orig->addr[1]; + ctx->user_ip6[2] = orig->addr[2]; + ctx->user_ip6[3] = orig->addr[3]; + ctx->user_port = orig->port; + } + } return 1; } diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c index d2b38fa6a5b0..e83d0b48d80c 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c @@ -73,6 +73,7 @@ int bpf_sk_lookup_test0(struct __sk_buff *skb) tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + bpf_printk("sk=%d\n", sk ? 1 : 0); if (sk) bpf_sk_release(sk); return sk ? TC_ACT_OK : TC_ACT_UNSPEC; diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 9b4d3a68a91a..a443d3637db3 100644 --- a/tools/testing/selftests/bpf/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -110,8 +110,6 @@ int bpf_prog2(struct __sk_buff *skb) flags = *f; } - bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", - len, flags); #ifdef SOCKMAP return bpf_sk_redirect_map(skb, &sock_map, ret, flags); #else @@ -143,8 +141,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("passive(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: @@ -160,8 +156,6 @@ int bpf_sockmap(struct bpf_sock_ops *skops) err = bpf_sock_hash_update(skops, &sock_map, &ret, BPF_NOEXIST); #endif - bpf_printk("active(%i -> %i) map ctx update err: %d\n", - lport, bpf_ntohl(rport), err); } break; default: @@ -199,72 +193,6 @@ int bpf_prog4(struct sk_msg_md *msg) } SEC("sk_msg2") -int bpf_prog5(struct sk_msg_md *msg) -{ - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int *start, *end, *start_push, *end_push, *start_pop, *pop; - int *bytes, len1, len2 = 0, len3, len4; - int err1 = -1, err2 = -1; - - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - int err; - - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? 
*end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - int err; - - bpf_printk("sk_msg2: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg2: push_data err %i\n", err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg2: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg2: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length pop_data %i->%i\n", - len1 ? len1 : 0, len4); - } - - bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", - len1, err1, err2); - return SK_PASS; -} - -SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; @@ -305,86 +233,7 @@ int bpf_prog6(struct sk_msg_md *msg) #endif } -SEC("sk_msg4") -int bpf_prog7(struct sk_msg_md *msg) -{ - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f; - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; - int len1, len2 = 0, len3, len4; - int err1 = 0, err2 = 0, key = 0; - __u64 flags = 0; - - int err; - bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); - if (bytes) - err1 = bpf_msg_apply_bytes(msg, *bytes); - bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); - if (bytes) - err2 = bpf_msg_cork_bytes(msg, *bytes); - len1 = (__u64)msg->data_end - (__u64)msg->data; - - start = bpf_map_lookup_elem(&sock_bytes, &zero); - end = bpf_map_lookup_elem(&sock_bytes, &one); - if (start && end) { - bpf_printk("sk_msg2: pull(%i:%i)\n", - start ? *start : 0, end ? *end : 0); - err = bpf_msg_pull_data(msg, *start, *end, 0); - if (err) - bpf_printk("sk_msg2: pull_data err %i\n", - err); - len2 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg2: length update %i->%i\n", - len1, len2); - } - - start_push = bpf_map_lookup_elem(&sock_bytes, &two); - end_push = bpf_map_lookup_elem(&sock_bytes, &three); - if (start_push && end_push) { - bpf_printk("sk_msg4: push(%i:%i)\n", - start_push ? *start_push : 0, - end_push ? *end_push : 0); - err = bpf_msg_push_data(msg, *start_push, *end_push, 0); - if (err) - bpf_printk("sk_msg4: push_data err %i\n", - err); - len3 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length push_update %i->%i\n", - len2 ? len2 : len1, len3); - } - - start_pop = bpf_map_lookup_elem(&sock_bytes, &four); - pop = bpf_map_lookup_elem(&sock_bytes, &five); - if (start_pop && pop) { - int err; - - bpf_printk("sk_msg4: pop(%i@%i)\n", - start_pop, pop); - err = bpf_msg_pop_data(msg, *start_pop, *pop, 0); - if (err) - bpf_printk("sk_msg4: pop_data err %i\n", err); - len4 = (__u64)msg->data_end - (__u64)msg->data; - bpf_printk("sk_msg4: length pop_data %i->%i\n", - len1 ? 
len1 : 0, len4); - } - - - f = bpf_map_lookup_elem(&sock_redir_flags, &zero); - if (f && *f) { - key = 2; - flags = *f; - } - bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", - len1, flags, err1 ? err1 : err2); -#ifdef SOCKMAP - err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); -#else - err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags); -#endif - bpf_printk("sk_msg3: err %i\n", err); - return err; -} - -SEC("sk_msg5") +SEC("sk_msg3") int bpf_prog8(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -401,7 +250,7 @@ int bpf_prog8(struct sk_msg_md *msg) } return SK_PASS; } -SEC("sk_msg6") +SEC("sk_msg4") int bpf_prog9(struct sk_msg_md *msg) { void *data_end = (void *)(long) msg->data_end; @@ -419,7 +268,7 @@ int bpf_prog9(struct sk_msg_md *msg) return SK_PASS; } -SEC("sk_msg7") +SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) { int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; @@ -443,7 +292,6 @@ int bpf_prog10(struct sk_msg_md *msg) pop = bpf_map_lookup_elem(&sock_bytes, &five); if (start_pop && pop) bpf_msg_pop_data(msg, *start_pop, *pop, 0); - bpf_printk("return sk drop\n"); return SK_DROP; } diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 779e11da979c..c80643828b82 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -54,7 +54,7 @@ static void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o" #define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o" #define CG_PATH "/sockmap" @@ -68,9 +68,7 @@ struct bpf_map *maps[8]; int prog_fd[11]; int txmsg_pass; -int txmsg_noisy; int txmsg_redir; -int txmsg_redir_noisy; int txmsg_drop; int txmsg_apply; int txmsg_cork; @@ -89,15 +87,13 @@ static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, {"rate", required_argument, NULL, 'r' }, - {"verbose", no_argument, NULL, 'v' }, + {"verbose", optional_argument, NULL, 'v' }, {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, {"data_test", no_argument, NULL, 'd' }, {"txmsg", no_argument, &txmsg_pass, 1 }, - {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, {"txmsg_redir", no_argument, &txmsg_redir, 1 }, - {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, {"txmsg_drop", no_argument, &txmsg_drop, 1 }, {"txmsg_apply", required_argument, NULL, 'a'}, {"txmsg_cork", required_argument, NULL, 'k'}, @@ -111,9 +107,104 @@ static const struct option long_options[] = { {"txmsg_skb", no_argument, &txmsg_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"whitelist", required_argument, NULL, 'n' }, + {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } }; +struct test_env { + const char *type; + const char *subtest; + const char *prepend; + + int test_num; + int subtest_num; + + int succ_cnt; + int fail_cnt; + int fail_last; +}; + +struct test_env env; + +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; + int iov_count; + int iov_length; + int rate; + char *map; + char *whitelist; + char *blacklist; + char *prepend; +}; + +struct _test { + char *title; + void (*tester)(int cg_fd, struct sockmap_options *opt); +}; + +static void test_start(void) +{ + 
env.subtest_num++; +} + +static void test_fail(void) +{ + env.fail_cnt++; +} + +static void test_pass(void) +{ + env.succ_cnt++; +} + +static void test_reset(void) +{ + txmsg_start = txmsg_end = 0; + txmsg_start_pop = txmsg_pop = 0; + txmsg_start_push = txmsg_end_push = 0; + txmsg_pass = txmsg_drop = txmsg_redir = 0; + txmsg_apply = txmsg_cork = 0; + txmsg_ingress = txmsg_skb = 0; +} + +static int test_start_subtest(const struct _test *t, struct sockmap_options *o) +{ + env.type = o->map; + env.subtest = t->title; + env.prepend = o->prepend; + env.test_num++; + env.subtest_num = 0; + env.fail_last = env.fail_cnt; + test_reset(); + return 0; +} + +static void test_end_subtest(void) +{ + int error = env.fail_cnt - env.fail_last; + int type = strcmp(env.type, BPF_SOCKMAP_FILENAME); + + if (!error) + test_pass(); + + fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n", + env.test_num, env.subtest_num, + !type ? "sockmap" : "sockhash", + env.prepend ? : "", + env.subtest, error ? "FAIL" : "OK"); +} + +static void test_print_results(void) +{ + fprintf(stdout, "Pass: %d Fail: %d\n", + env.succ_cnt, env.fail_cnt); +} + static void usage(char *argv[]) { int i; @@ -296,7 +387,7 @@ static int sockmap_init_sockets(int verbose) return errno; } - if (verbose) { + if (verbose > 1) { printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); @@ -311,17 +402,6 @@ struct msg_stats { struct timespec end; }; -struct sockmap_options { - int verbose; - bool base; - bool sendpage; - bool data_test; - bool drop_expected; - int iov_count; - int iov_length; - int rate; -}; - static int msg_loop_sendpage(int fd, int iov_length, int cnt, struct msg_stats *s, struct sockmap_options *opt) @@ -345,14 +425,18 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendfile(fd, fp, NULL, iov_length); + int sent; + + errno = 0; + sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendpage loop error"); fclose(file); return sent; } else if (drop && sent >= 0) { - printf("sendpage loop error expected: %i\n", sent); + printf("sendpage loop error expected: %i errno %i\n", + sent, errno); fclose(file); return -EIO; } @@ -464,13 +548,18 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { - int sent = sendmsg(fd, &msg, flags); + int sent; + + errno = 0; + sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error"); + perror("sendmsg loop error"); goto out_errno; } else if (drop && sent >= 0) { - printf("send loop error expected: %i\n", sent); + fprintf(stderr, + "sendmsg loop error expected: %i errno %i\n", + sent, errno); errno = -EIO; goto out_errno; } @@ -497,9 +586,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, * paths. 
*/ total_bytes = (float)iov_count * (float)iov_length * (float)cnt; - txmsg_pop_total = txmsg_pop; if (txmsg_apply) - txmsg_pop_total *= (total_bytes / txmsg_apply); + txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply); + else + txmsg_pop_total = txmsg_pop * cnt; total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) @@ -633,14 +723,18 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { + iov_buf -= (txmsg_pop - txmsg_start_pop + 1); if (opt->drop_expected) - exit(0); + _exit(0); + + if (!iov_buf) /* zero bytes sent case */ + _exit(0); if (opt->sendpage) iov_count = 1; err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false, opt); - if (opt->verbose) + if (opt->verbose > 1) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); @@ -648,7 +742,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -678,7 +772,7 @@ static int sendmsg_test(struct sockmap_options *opt) sent_Bps = sentBps(s); recvd_Bps = recvdBps(s); } - if (opt->verbose) + if (opt->verbose > 1) fprintf(stdout, "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, @@ -694,14 +788,14 @@ static int sendmsg_test(struct sockmap_options *opt) if (WIFEXITED(rx_status)) { err = WEXITSTATUS(rx_status); if (err) { - fprintf(stderr, "rx thread exited with err %d. ", err); + fprintf(stderr, "rx thread exited with err %d.\n", err); goto out; } } if (WIFEXITED(tx_status)) { err = WEXITSTATUS(tx_status); if (err) - fprintf(stderr, "tx thread exited with err %d. 
", err); + fprintf(stderr, "tx thread exited with err %d.\n", err); } out: return err; @@ -783,6 +877,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) } enum { + SELFTESTS, PING_PONG, SENDMSG, BASE, @@ -834,19 +929,14 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) tx_prog_fd = prog_fd[3]; - else if (txmsg_noisy) - tx_prog_fd = prog_fd[4]; else if (txmsg_redir) + tx_prog_fd = prog_fd[4]; + else if (txmsg_apply) tx_prog_fd = prog_fd[5]; - else if (txmsg_redir_noisy) + else if (txmsg_cork) tx_prog_fd = prog_fd[6]; else if (txmsg_drop) - tx_prog_fd = prog_fd[9]; - /* apply and cork must be last */ - else if (txmsg_apply) tx_prog_fd = prog_fd[7]; - else if (txmsg_cork) - tx_prog_fd = prog_fd[8]; else tx_prog_fd = 0; @@ -870,7 +960,7 @@ run: goto out; } - if (txmsg_redir || txmsg_redir_noisy) + if (txmsg_redir) redir_fd = c2; else redir_fd = c1; @@ -1112,12 +1202,8 @@ static void test_options(char *options) if (txmsg_pass) strncat(options, "pass,", OPTSTRING); - if (txmsg_noisy) - strncat(options, "pass_noisy,", OPTSTRING); if (txmsg_redir) strncat(options, "redir,", OPTSTRING); - if (txmsg_redir_noisy) - strncat(options, "redir_noisy,", OPTSTRING); if (txmsg_drop) strncat(options, "drop,", OPTSTRING); if (txmsg_apply) { @@ -1168,416 +1254,283 @@ static int __test_exec(int cgrp, int test, struct sockmap_options *opt) test_options(options); - fprintf(stdout, - "[TEST %i]: (%i, %i, %i, %s, %s): ", - test_cnt, opt->rate, opt->iov_count, opt->iov_length, - test_to_str(test), options); - fflush(stdout); + if (opt->verbose) { + fprintf(stdout, + " [TEST %i]: (%i, %i, %i, %s, %s): ", + test_cnt, opt->rate, opt->iov_count, opt->iov_length, + test_to_str(test), options); + fflush(stdout); + } err = run_options(opt, cgrp, test); - fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED"); + if (opt->verbose) + fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED"); test_cnt++; !err ? 
passed++ : failed++; free(options); return err; } -static int test_exec(int cgrp, struct sockmap_options *opt) -{ - int err = __test_exec(cgrp, SENDMSG, opt); - - if (err) - goto out; - - err = __test_exec(cgrp, SENDPAGE, opt); -out: - return err; -} - -static int test_loop(int cgrp) -{ - struct sockmap_options opt; - - int err, i, l, r; - - opt.verbose = 0; - opt.base = false; - opt.sendpage = false; - opt.data_test = false; - opt.drop_expected = false; - opt.iov_count = 0; - opt.iov_length = 0; - opt.rate = 0; - - r = 1; - for (i = 1; i < 100; i += 33) { - for (l = 1; l < 100; l += 33) { - opt.rate = r; - opt.iov_count = i; - opt.iov_length = l; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - } - sched_yield(); -out: - return err; -} - -static int test_txmsg(int cgrp) +static void test_exec(int cgrp, struct sockmap_options *opt) { + int type = strcmp(opt->map, BPF_SOCKMAP_FILENAME); int err; - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_ingress = txmsg_skb = 0; - - txmsg_pass = 1; - err = test_loop(cgrp); - txmsg_pass = 0; - if (err) - goto out; - - txmsg_redir = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - if (err) - goto out; - - txmsg_drop = 1; - err = test_loop(cgrp); - txmsg_drop = 0; - if (err) - goto out; - - txmsg_redir = 1; - txmsg_ingress = 1; - err = test_loop(cgrp); - txmsg_redir = 0; - txmsg_ingress = 0; - if (err) - goto out; -out: - txmsg_pass = 0; - txmsg_redir = 0; - txmsg_drop = 0; - return err; + if (type == 0) { + test_start(); + err = __test_exec(cgrp, SENDMSG, opt); + if (err) + test_fail(); + } else { + test_start(); + err = __test_exec(cgrp, SENDPAGE, opt); + if (err) + test_fail(); + } } -static int test_send(struct sockmap_options *opt, int cgrp) +static void test_send_one(struct sockmap_options *opt, int cgrp) { - int err; - opt->iov_length = 1; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1; opt->iov_count = 1024; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->iov_length = 1024; opt->iov_count = 1; opt->rate = 1; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); - opt->iov_length = 1; +} + +static void test_send_many(struct sockmap_options *opt, int cgrp) +{ + opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; - err = test_exec(cgrp, opt); - if (err) - goto out; - - opt->iov_length = 256; - opt->iov_count = 1024; - opt->rate = 2; - err = test_exec(cgrp, opt); - if (err) - goto out; + test_exec(cgrp, opt); opt->rate = 100; opt->iov_count = 1; opt->iov_length = 5; - err = test_exec(cgrp, opt); - if (err) - goto out; -out: - sched_yield(); - return err; + test_exec(cgrp, opt); } -static int test_mixed(int cgrp) +static void test_send_large(struct sockmap_options *opt, int cgrp) { - struct sockmap_options opt = {0}; - int err; + opt->iov_length = 256; + opt->iov_count = 1024; + opt->rate = 2; + test_exec(cgrp, opt); +} - txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0; - txmsg_apply = txmsg_cork = 0; - txmsg_start = txmsg_end = 0; - txmsg_start_push = txmsg_end_push = 0; - txmsg_start_pop = txmsg_pop = 0; +static void test_send(struct sockmap_options *opt, int cgrp) +{ + test_send_one(opt, cgrp); + test_send_many(opt, cgrp); + test_send_large(opt, cgrp); + sched_yield(); +} +static void test_txmsg_pass(int cgrp, struct sockmap_options *opt) +{ /* Test small and large iov_count values with pass/redir/apply/cork 
*/ txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_redir = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_drop(int cgrp, struct sockmap_options *opt) +{ + txmsg_drop = 1; + test_send(opt, cgrp); +} - txmsg_pass = 1; - txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = txmsg_drop = 0; + txmsg_ingress = txmsg_redir = 1; + test_send(opt, cgrp); +} +/* Test cork with hung data. This tests poor usage patterns where + * cork can leave data on the ring if user program is buggy and + * doesn't flush them somehow. They do take some time however + * because they wait for a timeout. Test pass, redir and cork with + * apply logic. Use cork size of 4097 with send_large to avoid + * aligning cork size with send size. + */ +static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt) +{ txmsg_pass = 1; txmsg_redir = 0; + txmsg_cork = 4097; + txmsg_apply = 4097; + test_send_large(opt, cgrp); + + txmsg_pass = 0; + txmsg_redir = 1; txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 4097; + test_send_large(opt, cgrp); - txmsg_pass = 1; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 4097; + txmsg_cork = 4097; + test_send_large(opt, cgrp); +} + +static void test_txmsg_pull(int cgrp, struct sockmap_options *opt) +{ + /* Test basic start/end */ + txmsg_start = 1; + txmsg_end = 2; + test_send(opt, cgrp); + + /* Test >4k pull */ + txmsg_start = 4096; + txmsg_end = 9182; + test_send_large(opt, cgrp); + + /* Test pull + redirect */ txmsg_redir = 0; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_start = 1; + txmsg_end = 2; + test_send(opt, cgrp); - txmsg_pass = 1; + /* Test pull + cork */ txmsg_redir = 0; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; + /* Test pull + cork + redirect */ txmsg_redir = 1; - txmsg_apply = 1; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_cork = 512; + txmsg_start = 1; + txmsg_end = 2; + test_send_many(opt, cgrp); +} - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1; - err = test_send(&opt, cgrp); - if (err) - goto out; +static void test_txmsg_pop(int cgrp, struct sockmap_options *opt) +{ + /* Test basic pop */ + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; - txmsg_redir = 1; - txmsg_apply = 1024; - txmsg_cork = 0; - err = test_send(&opt, cgrp); - if (err) - goto out; + /* Test pop with >4k */ + txmsg_start_pop = 4096; + txmsg_pop = 4096; + test_send_large(opt, cgrp); - txmsg_pass = 0; + /* Test pop + redirect */ txmsg_redir = 1; - txmsg_apply = 0; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; - txmsg_redir = 
1; - txmsg_apply = 1024; - txmsg_cork = 1024; - err = test_send(&opt, cgrp); - if (err) - goto out; + /* Test pop + cork */ + txmsg_redir = 0; + txmsg_cork = 512; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); - txmsg_pass = 0; + /* Test pop + redirect + cork */ txmsg_redir = 1; - txmsg_cork = 4096; - txmsg_apply = 4096; - err = test_send(&opt, cgrp); - if (err) - goto out; -out: - return err; + txmsg_cork = 4; + txmsg_start_pop = 1; + txmsg_pop = 2; + test_send_many(opt, cgrp); } -static int test_start_end(int cgrp) +static void test_txmsg_push(int cgrp, struct sockmap_options *opt) { - struct sockmap_options opt = {0}; - int err, i; + /* Test basic push */ + txmsg_start_push = 1; + txmsg_end_push = 1; + test_send(opt, cgrp); - /* Test basic start/end with lots of iov_count and iov_lengths */ - txmsg_start = 1; - txmsg_end = 2; + /* Test push 4kB >4k */ + txmsg_start_push = 4096; + txmsg_end_push = 4096; + test_send_large(opt, cgrp); + + /* Test push + redirect */ + txmsg_redir = 1; txmsg_start_push = 1; txmsg_end_push = 2; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; + test_send_many(opt, cgrp); - /* Cut a byte of pushed data but leave reamining in place */ - txmsg_start = 1; - txmsg_end = 2; + /* Test push + cork */ + txmsg_redir = 0; + txmsg_cork = 512; txmsg_start_push = 1; - txmsg_end_push = 3; - txmsg_start_pop = 1; - txmsg_pop = 1; - err = test_txmsg(cgrp); - if (err) - goto out; - - /* Test start/end with cork */ - opt.rate = 16; - opt.iov_count = 1; - opt.iov_length = 100; - txmsg_cork = 1600; - - txmsg_start_pop = 0; - txmsg_pop = 0; - - for (i = 99; i <= 1600; i += 500) { - txmsg_start = 0; - txmsg_end = i; - txmsg_start_push = 0; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - - /* Test pop data in middle of cork */ - for (i = 99; i <= 1600; i += 500) { - txmsg_start_pop = 10; - txmsg_pop = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end with cork but pull data in middle */ - for (i = 199; i <= 1600; i += 500) { - txmsg_start = 100; - txmsg_end = i; - txmsg_start_push = 100; - txmsg_end_push = i; - err = test_exec(cgrp, &opt); - if (err) - goto out; - } - - /* Test start/end with cork pulling last sg entry */ - txmsg_start = 1500; - txmsg_end = 1600; - txmsg_start_push = 1500; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_end_push = 2; + test_send_many(opt, cgrp); +} - /* Test pop with cork pulling last sg entry */ - txmsg_start_pop = 1500; - txmsg_pop = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; - txmsg_start_pop = 0; - txmsg_pop = 0; - - /* Test start/end pull of single byte in last page */ - txmsg_start = 1111; - txmsg_end = 1112; - txmsg_start_push = 1111; - txmsg_end_push = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt) +{ + txmsg_start_push = 1; + txmsg_end_push = 10; + txmsg_start_pop = 5; + txmsg_pop = 4; + test_send_large(opt, cgrp); +} - /* Test pop of single byte in last page */ - txmsg_start_pop = 1111; - txmsg_pop = 1112; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_apply(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); - /* Test start/end with end < start */ - txmsg_start = 1111; - txmsg_end = 0; - txmsg_start_push = 1111; - 
txmsg_end_push = 0; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1; + txmsg_cork = 0; + test_send_one(opt, cgrp); - /* Test start/end with end > data */ - txmsg_start = 0; - txmsg_end = 1601; - txmsg_start_push = 0; - txmsg_end_push = 1601; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); - /* Test start/end with start > data */ - txmsg_start = 1601; - txmsg_end = 1600; - txmsg_start_push = 1601; - txmsg_end_push = 1600; - err = test_exec(cgrp, &opt); - if (err) - goto out; + txmsg_pass = 0; + txmsg_redir = 1; + txmsg_apply = 1024; + txmsg_cork = 0; + test_send_large(opt, cgrp); +} - /* Test pop with start > data */ - txmsg_start_pop = 1601; - txmsg_pop = 1; - err = test_exec(cgrp, &opt); - if (err) - goto out; +static void test_txmsg_cork(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 0; + txmsg_cork = 1; + test_send(opt, cgrp); - /* Test pop with pop range > data */ - txmsg_start_pop = 1599; - txmsg_pop = 10; - err = test_exec(cgrp, &opt); -out: - txmsg_start = 0; - txmsg_end = 0; - sched_yield(); - return err; + txmsg_pass = 1; + txmsg_redir = 0; + txmsg_apply = 1; + txmsg_cork = 1; + test_send(opt, cgrp); } char *map_names[] = { @@ -1662,73 +1615,116 @@ static int populate_progs(char *bpf_file) return 0; } -static int __test_suite(int cg_fd, char *bpf_file) +struct _test test[] = { + {"txmsg test passthrough", test_txmsg_pass}, + {"txmsg test redirect", test_txmsg_redir}, + {"txmsg test drop", test_txmsg_drop}, + {"txmsg test ingress redirect", test_txmsg_ingress_redir}, + {"txmsg test apply", test_txmsg_apply}, + {"txmsg test cork", test_txmsg_cork}, + {"txmsg test hanging corks", test_txmsg_cork_hangs}, + {"txmsg test push_data", test_txmsg_push}, + {"txmsg test pull-data", test_txmsg_pull}, + {"txmsg test pop-data", test_txmsg_pop}, + {"txmsg test push/pop data", test_txmsg_push_pop}, +}; + +static int check_whitelist(struct _test *t, struct sockmap_options *opt) { - int err, cleanup = cg_fd; + char *entry, *ptr; + + if (!opt->whitelist) + return 0; + ptr = strdup(opt->whitelist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} - err = populate_progs(bpf_file); +static int check_blacklist(struct _test *t, struct sockmap_options *opt) +{ + char *entry, *ptr; + + if (!opt->blacklist) + return -EINVAL; + ptr = strdup(opt->blacklist); + if (!ptr) + return -ENOMEM; + entry = strtok(ptr, ","); + while (entry) { + if ((opt->prepend && strstr(opt->prepend, entry) != 0) || + strstr(opt->map, entry) != 0 || + strstr(t->title, entry) != 0) + return 0; + entry = strtok(NULL, ","); + } + return -EINVAL; +} + +static int __test_selftests(int cg_fd, struct sockmap_options *opt) +{ + int i, err; + + err = populate_progs(opt->map); if (err < 0) { fprintf(stderr, "ERROR: (%i) load bpf failed\n", err); return err; } - if (cg_fd < 0) { - if (setup_cgroup_environment()) { - fprintf(stderr, "ERROR: cgroup env failed\n"); - return -EINVAL; - } + /* Tests basic commands and APIs */ + for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) { + struct _test t = test[i]; - cg_fd = create_and_get_cgroup(CG_PATH); - if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path 
failed: %s\n", - cg_fd, optarg); - return cg_fd; - } + if (check_whitelist(&t, opt) != 0) + continue; + if (check_blacklist(&t, opt) == 0) + continue; - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; - } + test_start_subtest(&t, opt); + t.tester(cg_fd, opt); + test_end_subtest(); } - /* Tests basic commands and APIs with range of iov values */ - txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0; - err = test_txmsg(cg_fd); - if (err) - goto out; + return err; +} - /* Tests interesting combinations of APIs used together */ - err = test_mixed(cg_fd); - if (err) - goto out; +static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKMAP_FILENAME; + __test_selftests(cg_fd, opt); +} - /* Tests pull_data API using start/end API */ - err = test_start_end(cg_fd); - if (err) - goto out; +static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + __test_selftests(cg_fd, opt); +} -out: - printf("Summary: %i PASSED %i FAILED\n", passed, failed); - if (cleanup < 0) { - cleanup_cgroup_environment(); - close(cg_fd); - } - return err; +static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt) +{ + opt->map = BPF_SOCKHASH_FILENAME; + opt->prepend = "ktls"; + ktls = 1; + __test_selftests(cg_fd, opt); + ktls = 0; } -static int test_suite(int cg_fd) +static int test_selftest(int cg_fd, struct sockmap_options *opt) { - int err; - err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME); - if (err) - goto out; - err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME); -out: - if (cg_fd > -1) - close(cg_fd); - return err; + test_selftests_sockmap(cg_fd, opt); + test_selftests_sockhash(cg_fd, opt); + test_selftests_ktls(cg_fd, opt); + test_print_results(); + return 0; } int main(int argc, char **argv) @@ -1737,12 +1733,10 @@ int main(int argc, char **argv) struct sockmap_options options = {0}; int opt, longindex, err, cg_fd = 0; char *bpf_file = BPF_SOCKMAP_FILENAME; - int test = PING_PONG; + int test = SELFTESTS; + bool cg_created = 0; - if (argc < 2) - return test_suite(-1); - - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:", + while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:", long_options, &longindex)) != -1) { switch (opt) { case 's': @@ -1783,6 +1777,8 @@ int main(int argc, char **argv) break; case 'v': options.verbose = 1; + if (optarg) + options.verbose = atoi(optarg); break; case 'i': iov_count = atoi(optarg); @@ -1809,6 +1805,15 @@ int main(int argc, char **argv) return -1; } break; + case 'n': + options.whitelist = strdup(optarg); + if (!options.whitelist) + return -ENOMEM; + break; + case 'b': + options.blacklist = strdup(optarg); + if (!options.blacklist) + return -ENOMEM; case 0: break; case 'h': @@ -1818,13 +1823,30 @@ int main(int argc, char **argv) } } - if (argc <= 3 && cg_fd) - return test_suite(cg_fd); - if (!cg_fd) { - fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", - argv[0]); - return -1; + if (setup_cgroup_environment()) { + fprintf(stderr, "ERROR: cgroup env failed\n"); + return -EINVAL; + } + + cg_fd = create_and_get_cgroup(CG_PATH); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, strerror(errno)); + return cg_fd; + } + + if (join_cgroup(CG_PATH)) { + fprintf(stderr, "ERROR: failed to join cgroup\n"); + return -EINVAL; + } + cg_created = 1; + } + + if (test == SELFTESTS) { + err = test_selftest(cg_fd, &options); + goto out; } err = 
populate_progs(bpf_file);
@@ -1843,6 +1865,13 @@ int main(int argc, char **argv)
 	options.rate = rate;
 
 	err = run_options(&options, cg_fd, test);
+out:
+	if (options.whitelist)
+		free(options.whitelist);
+	if (options.blacklist)
+		free(options.blacklist);
+	if (cg_created)
+		cleanup_cgroup_environment();
 	close(cg_fd);
 	return err;
 }
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
index 604b46151736..056e0273bf12 100644
--- a/tools/testing/selftests/bpf/verifier/ref_tracking.c
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -821,3 +821,36 @@
 	.result = REJECT,
 	.errstr = "invalid mem access",
 },
+{
+	"reference tracking: branch tracking valid pointer null comparison",
+	.insns = {
+	BPF_SK_LOOKUP(sk_lookup_tcp),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_EMIT_CALL(BPF_FUNC_sk_release),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.result = ACCEPT,
+},
+{
+	"reference tracking: branch tracking valid pointer value comparison",
+	.insns = {
+	BPF_SK_LOOKUP(sk_lookup_tcp),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	BPF_MOV64_IMM(BPF_REG_3, 1),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+	BPF_EMIT_CALL(BPF_FUNC_sk_release),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.errstr = "Unreleased reference",
+	.result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c
index 860d4a71cd83..3ecb70a3d939 100644
--- a/tools/testing/selftests/bpf/verifier/value_or_null.c
+++ b/tools/testing/selftests/bpf/verifier/value_or_null.c
@@ -150,3 +150,22 @@
 	.result_unpriv = REJECT,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
+{
+	"map lookup and null branch prediction",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, 10),
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+	BPF_LD_MAP_FD(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+	BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2),
+	BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1),
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10),
+	BPF_EXIT_INSN(),
+	},
+	.fixup_map_hash_8b = { 4 },
+	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	.result = ACCEPT,
+},
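
A note on the two ref_tracking cases added above: they encode, at the instruction level, a program that looks up a socket, records whether the lookup succeeded, and then decides on a second branch whether to call bpf_sk_release(). The first case is accepted because both outcomes of the null test leave the reference state consistent; the second is rejected because comparing the pointer against the value 1234 tells the verifier nothing, so one path can exit while still holding the reference. The restricted-C fragment below is only an illustration of the accepted pattern and is not part of this series; the section name, program name and all-zero tuple are made up here, and the helpers used are the standard bpf_sk_lookup_tcp()/bpf_sk_release() ones.

/* Illustrative sketch only (not from this series): a SCHED_CLS program in
 * restricted C whose generated branches follow the accepted
 * "branch tracking valid pointer null comparison" test above.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("classifier")
int branch_tracking_example(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};	/* zeroed tuple: lookup simply misses */
	struct bpf_sock *sk;
	int found;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	found = sk ? 1 : 0;		/* first null test: the reference is still tracked */
	if (sk)				/* second null test on the same pointer */
		bpf_sk_release(sk);	/* released exactly on the path that holds it */
	return found;
}

char _license[] SEC("license") = "GPL";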
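
One more aside, on the --whitelist/--blacklist options introduced in test_sockmap.c earlier in this diff: check_whitelist() and check_blacklist() split their argument on commas and select a subtest when any entry is a substring of the map file name, the prepend label (for example "ktls"), or the subtest title, with the blacklist applied after the whitelist. The userspace sketch below restates that matching rule; it is standalone code with made-up names written for illustration, not something taken from the patch.

/* Standalone sketch of the comma-separated substring matching used by the
 * new --whitelist/--blacklist test filters; names here are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool filter_matches(const char *filter, const char *title)
{
	char *copy, *entry;
	bool match = false;

	copy = strdup(filter);		/* strtok() modifies its argument */
	if (!copy)
		return false;
	for (entry = strtok(copy, ","); entry; entry = strtok(NULL, ","))
		if (strstr(title, entry)) {	/* an entry may match anywhere in the title */
			match = true;
			break;
		}
	free(copy);
	return match;
}

int main(void)
{
	/* e.g. --whitelist "cork,push" keeps the first title and drops the second */
	printf("%d\n", filter_matches("cork,push", "txmsg test hanging corks"));
	printf("%d\n", filter_matches("cork,push", "txmsg test drop"));
	return 0;
}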