summaryrefslogtreecommitdiff
path: root/net/xdp
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-06-06 18:39:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-06-06 18:39:49 -0700
commit1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch)
treedcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /net/xdp
parent285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff)
parent7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) Add Maglev hashing scheduler to IPVS, from Inju Song. 2) Lots of new TC subsystem tests from Roman Mashak. 3) Add TCP zero copy receive and fix delayed acks and autotuning with SO_RCVLOWAT, from Eric Dumazet. 4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard Brouer. 5) Add ttl inherit support to vxlan, from Hangbin Liu. 6) Properly separate ipv6 routes into their logically independant components. fib6_info for the routing table, and fib6_nh for sets of nexthops, which thus can be shared. From David Ahern. 7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP messages from XDP programs. From Nikita V. Shirokov. 8) Lots of long overdue cleanups to the r8169 driver, from Heiner Kallweit. 9) Add BTF ("BPF Type Format"), from Martin KaFai Lau. 10) Add traffic condition monitoring to iwlwifi, from Luca Coelho. 11) Plumb extack down into fib_rules, from Roopa Prabhu. 12) Add Flower classifier offload support to igb, from Vinicius Costa Gomes. 13) Add UDP GSO support, from Willem de Bruijn. 14) Add documentation for eBPF helpers, from Quentin Monnet. 15) Add TLS tx offload to mlx5, from Ilya Lesokhin. 16) Allow applications to be given the number of bytes available to read on a socket via a control message returned from recvmsg(), from Soheil Hassas Yeganeh. 17) Add x86_32 eBPF JIT compiler, from Wang YanQing. 18) Add AF_XDP sockets, with zerocopy support infrastructure as well. From Björn Töpel. 19) Remove indirect load support from all of the BPF JITs and handle these operations in the verifier by translating them into native BPF instead. From Daniel Borkmann. 20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha. 21) Allow XDP programs to do lookups in the main kernel routing tables for forwarding. From David Ahern. 22) Allow drivers to store hardware state into an ELF section of kernel dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy. 23) Various RACK and loss detection improvements in TCP, from Yuchung Cheng. 24) Add TCP SACK compression, from Eric Dumazet. 25) Add User Mode Helper support and basic bpfilter infrastructure, from Alexei Starovoitov. 26) Support ports and protocol values in RTM_GETROUTE, from Roopa Prabhu. 27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard Brouer. 28) Add lots of forwarding selftests, from Petr Machata. 29) Add generic network device failover driver, from Sridhar Samudrala. * ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits) strparser: Add __strp_unpause and use it in ktls. rxrpc: Fix terminal retransmission connection ID to include the channel net: hns3: Optimize PF CMDQ interrupt switching process net: hns3: Fix for VF mailbox receiving unknown message net: hns3: Fix for VF mailbox cannot receiving PF response bnx2x: use the right constant Revert "net: sched: cls: Fix offloading when ingress dev is vxlan" net: dsa: b53: Fix for brcm tag issue in Cygnus SoC enic: fix UDP rss bits netdev-FAQ: clarify DaveM's position for stable backports rtnetlink: validate attributes in do_setlink() mlxsw: Add extack messages for port_{un, }split failures netdevsim: Add extack error message for devlink reload devlink: Add extack to reload and port_{un, }split operations net: metrics: add proper netlink validation ipmr: fix error path when ipmr_new_table fails ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds net: hns3: remove unused hclgevf_cfg_func_mta_filter netfilter: provide udp*_lib_lookup for nf_tproxy qed*: Utilize FW 8.37.2.0 ...
Diffstat (limited to 'net/xdp')
-rw-r--r--net/xdp/Kconfig7
-rw-r--r--net/xdp/Makefile1
-rw-r--r--net/xdp/xdp_umem.c361
-rw-r--r--net/xdp/xdp_umem.h30
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c788
-rw-r--r--net/xdp/xsk_queue.c63
-rw-r--r--net/xdp/xsk_queue.h265
8 files changed, 1529 insertions, 0 deletions
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 000000000000..90e4a7152854
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,7 @@
+config XDP_SOCKETS
+ bool "XDP sockets"
+ depends on BPF_SYSCALL
+ default n
+ help
+ XDP sockets allows a channel between XDP programs and
+ userspace applications.
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..04f073146256
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..7eb4948a38d2
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+
+#include "xdp_umem.h"
+#include "xsk_queue.h"
+
+#define XDP_UMEM_MIN_CHUNK_SIZE 2048
+
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&umem->xsk_list_lock, flags);
+ list_add_rcu(&xs->list, &umem->xsk_list);
+ spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+}
+
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+ unsigned long flags;
+
+ if (xs->dev) {
+ spin_lock_irqsave(&umem->xsk_list_lock, flags);
+ list_del_rcu(&xs->list);
+ spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+
+ if (umem->zc)
+ synchronize_net();
+ }
+}
+
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+ u32 queue_id, u16 flags)
+{
+ bool force_zc, force_copy;
+ struct netdev_bpf bpf;
+ int err;
+
+ force_zc = flags & XDP_ZEROCOPY;
+ force_copy = flags & XDP_COPY;
+
+ if (force_zc && force_copy)
+ return -EINVAL;
+
+ if (force_copy)
+ return 0;
+
+ dev_hold(dev);
+
+ if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
+ bpf.command = XDP_QUERY_XSK_UMEM;
+
+ rtnl_lock();
+ err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+ rtnl_unlock();
+
+ if (err) {
+ dev_put(dev);
+ return force_zc ? -ENOTSUPP : 0;
+ }
+
+ bpf.command = XDP_SETUP_XSK_UMEM;
+ bpf.xsk.umem = umem;
+ bpf.xsk.queue_id = queue_id;
+
+ rtnl_lock();
+ err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+ rtnl_unlock();
+
+ if (err) {
+ dev_put(dev);
+ return force_zc ? err : 0; /* fail or fallback */
+ }
+
+ umem->dev = dev;
+ umem->queue_id = queue_id;
+ umem->zc = true;
+ return 0;
+ }
+
+ dev_put(dev);
+ return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+}
+
+static void xdp_umem_clear_dev(struct xdp_umem *umem)
+{
+ struct netdev_bpf bpf;
+ int err;
+
+ if (umem->dev) {
+ bpf.command = XDP_SETUP_XSK_UMEM;
+ bpf.xsk.umem = NULL;
+ bpf.xsk.queue_id = umem->queue_id;
+
+ rtnl_lock();
+ err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
+ rtnl_unlock();
+
+ if (err)
+ WARN(1, "failed to disable umem!\n");
+
+ dev_put(umem->dev);
+ umem->dev = NULL;
+ }
+}
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+ unsigned int i;
+
+ for (i = 0; i < umem->npgs; i++) {
+ struct page *page = umem->pgs[i];
+
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+
+ kfree(umem->pgs);
+ umem->pgs = NULL;
+}
+
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+ atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+ free_uid(umem->user);
+}
+
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+
+ xdp_umem_clear_dev(umem);
+
+ if (umem->fq) {
+ xskq_destroy(umem->fq);
+ umem->fq = NULL;
+ }
+
+ if (umem->cq) {
+ xskq_destroy(umem->cq);
+ umem->cq = NULL;
+ }
+
+ xdp_umem_unpin_pages(umem);
+
+ task = get_pid_task(umem->pid, PIDTYPE_PID);
+ put_pid(umem->pid);
+ if (!task)
+ goto out;
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ mmput(mm);
+ kfree(umem->pages);
+ umem->pages = NULL;
+
+ xdp_umem_unaccount_pages(umem);
+out:
+ kfree(umem);
+}
+
+static void xdp_umem_release_deferred(struct work_struct *work)
+{
+ struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
+
+ xdp_umem_release(umem);
+}
+
+void xdp_get_umem(struct xdp_umem *umem)
+{
+ refcount_inc(&umem->users);
+}
+
+void xdp_put_umem(struct xdp_umem *umem)
+{
+ if (!umem)
+ return;
+
+ if (refcount_dec_and_test(&umem->users)) {
+ INIT_WORK(&umem->work, xdp_umem_release_deferred);
+ schedule_work(&umem->work);
+ }
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem)
+{
+ unsigned int gup_flags = FOLL_WRITE;
+ long npgs;
+ int err;
+
+ umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+ if (!umem->pgs)
+ return -ENOMEM;
+
+ down_write(&current->mm->mmap_sem);
+ npgs = get_user_pages(umem->address, umem->npgs,
+ gup_flags, &umem->pgs[0], NULL);
+ up_write(&current->mm->mmap_sem);
+
+ if (npgs != umem->npgs) {
+ if (npgs >= 0) {
+ umem->npgs = npgs;
+ err = -ENOMEM;
+ goto out_pin;
+ }
+ err = npgs;
+ goto out_pgs;
+ }
+ return 0;
+
+out_pin:
+ xdp_umem_unpin_pages(umem);
+out_pgs:
+ kfree(umem->pgs);
+ umem->pgs = NULL;
+ return err;
+}
+
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+ unsigned long lock_limit, new_npgs, old_npgs;
+
+ if (capable(CAP_IPC_LOCK))
+ return 0;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ umem->user = get_uid(current_user());
+
+ do {
+ old_npgs = atomic_long_read(&umem->user->locked_vm);
+ new_npgs = old_npgs + umem->npgs;
+ if (new_npgs > lock_limit) {
+ free_uid(umem->user);
+ umem->user = NULL;
+ return -ENOBUFS;
+ }
+ } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
+ new_npgs) != old_npgs);
+ return 0;
+}
+
+static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+ u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+ unsigned int chunks, chunks_per_page;
+ u64 addr = mr->addr, size = mr->len;
+ int size_chk, err, i;
+
+ if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
+ /* Strictly speaking we could support this, if:
+ * - huge pages, or*
+ * - using an IOMMU, or
+ * - making sure the memory area is consecutive
+ * but for now, we simply say "computer says no".
+ */
+ return -EINVAL;
+ }
+
+ if (!is_power_of_2(chunk_size))
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(addr)) {
+ /* Memory area has to be page size aligned. For
+ * simplicity, this might change.
+ */
+ return -EINVAL;
+ }
+
+ if ((addr + size) < addr)
+ return -EINVAL;
+
+ chunks = (unsigned int)div_u64(size, chunk_size);
+ if (chunks == 0)
+ return -EINVAL;
+
+ chunks_per_page = PAGE_SIZE / chunk_size;
+ if (chunks < chunks_per_page || chunks % chunks_per_page)
+ return -EINVAL;
+
+ headroom = ALIGN(headroom, 64);
+
+ size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
+ if (size_chk < 0)
+ return -EINVAL;
+
+ umem->pid = get_task_pid(current, PIDTYPE_PID);
+ umem->address = (unsigned long)addr;
+ umem->props.chunk_mask = ~((u64)chunk_size - 1);
+ umem->props.size = size;
+ umem->headroom = headroom;
+ umem->chunk_size_nohr = chunk_size - headroom;
+ umem->npgs = size / PAGE_SIZE;
+ umem->pgs = NULL;
+ umem->user = NULL;
+ INIT_LIST_HEAD(&umem->xsk_list);
+ spin_lock_init(&umem->xsk_list_lock);
+
+ refcount_set(&umem->users, 1);
+
+ err = xdp_umem_account_pages(umem);
+ if (err)
+ goto out;
+
+ err = xdp_umem_pin_pages(umem);
+ if (err)
+ goto out_account;
+
+ umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
+ if (!umem->pages) {
+ err = -ENOMEM;
+ goto out_account;
+ }
+
+ for (i = 0; i < umem->npgs; i++)
+ umem->pages[i].addr = page_address(umem->pgs[i]);
+
+ return 0;
+
+out_account:
+ xdp_umem_unaccount_pages(umem);
+out:
+ put_pid(umem->pid);
+ return err;
+}
+
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
+{
+ struct xdp_umem *umem;
+ int err;
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ err = xdp_umem_reg(umem, mr);
+ if (err) {
+ kfree(umem);
+ return ERR_PTR(err);
+ }
+
+ return umem;
+}
+
+bool xdp_umem_validate_queues(struct xdp_umem *umem)
+{
+ return umem->fq && umem->cq;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..f11560334f88
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <net/xdp_sock.h>
+
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
+{
+ return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
+}
+
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+ return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+}
+
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+ u32 queue_id, u16 flags);
+bool xdp_umem_validate_queues(struct xdp_umem *umem);
+void xdp_get_umem(struct xdp_umem *umem);
+void xdp_put_umem(struct xdp_umem *umem);
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..40eab10dfc49
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef XDP_UMEM_PROPS_H_
+#define XDP_UMEM_PROPS_H_
+
+struct xdp_umem_props {
+ u64 chunk_mask;
+ u64 size;
+};
+
+#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..c6ed2454f7ce
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,788 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allows a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ * Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <net/xdp_sock.h>
+#include <net/xdp.h>
+
+#include "xsk_queue.h"
+#include "xdp_umem.h"
+
+#define TX_BATCH_SIZE 16
+
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+ return (struct xdp_sock *)sk;
+}
+
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+ return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
+ READ_ONCE(xs->umem->fq);
+}
+
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+{
+ return xskq_peek_addr(umem->fq, addr);
+}
+EXPORT_SYMBOL(xsk_umem_peek_addr);
+
+void xsk_umem_discard_addr(struct xdp_umem *umem)
+{
+ xskq_discard_addr(umem->fq);
+}
+EXPORT_SYMBOL(xsk_umem_discard_addr);
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+ void *buffer;
+ u64 addr;
+ int err;
+
+ if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+ len > xs->umem->chunk_size_nohr) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
+ addr += xs->umem->headroom;
+
+ buffer = xdp_umem_get_data(xs->umem, addr);
+ memcpy(buffer, xdp->data, len);
+ err = xskq_produce_batch_desc(xs->rx, addr, len);
+ if (!err) {
+ xskq_discard_addr(xs->umem->fq);
+ xdp_return_buff(xdp);
+ return 0;
+ }
+
+ xs->rx_dropped++;
+ return err;
+}
+
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+{
+ int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
+
+ if (err) {
+ xdp_return_buff(xdp);
+ xs->rx_dropped++;
+ }
+
+ return err;
+}
+
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ u32 len;
+
+ if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+ return -EINVAL;
+
+ len = xdp->data_end - xdp->data;
+
+ return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
+ __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+}
+
+void xsk_flush(struct xdp_sock *xs)
+{
+ xskq_produce_flush_desc(xs->rx);
+ xs->sk.sk_data_ready(&xs->sk);
+}
+
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ u32 len = xdp->data_end - xdp->data;
+ void *buffer;
+ u64 addr;
+ int err;
+
+ if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+ len > xs->umem->chunk_size_nohr) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
+ addr += xs->umem->headroom;
+
+ buffer = xdp_umem_get_data(xs->umem, addr);
+ memcpy(buffer, xdp->data, len);
+ err = xskq_produce_batch_desc(xs->rx, addr, len);
+ if (!err) {
+ xskq_discard_addr(xs->umem->fq);
+ xsk_flush(xs);
+ return 0;
+ }
+
+ xs->rx_dropped++;
+ return err;
+}
+
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+ xskq_produce_flush_addr_n(umem->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_umem_complete_tx);
+
+void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ xs->sk.sk_write_space(&xs->sk);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+ struct xdp_desc desc;
+ struct xdp_sock *xs;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ if (!xskq_peek_desc(xs->tx, &desc))
+ continue;
+
+ if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+ goto out;
+
+ *dma = xdp_umem_get_dma(umem, desc.addr);
+ *len = desc.len;
+
+ xskq_discard_desc(xs->tx);
+ rcu_read_unlock();
+ return true;
+ }
+
+out:
+ rcu_read_unlock();
+ return false;
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx);
+
+static int xsk_zc_xmit(struct sock *sk)
+{
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net_device *dev = xs->dev;
+
+ return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+}
+
+static void xsk_destruct_skb(struct sk_buff *skb)
+{
+ u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
+ struct xdp_sock *xs = xdp_sk(skb->sk);
+
+ WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+
+ sock_wfree(skb);
+}
+
+static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
+ size_t total_len)
+{
+ u32 max_batch = TX_BATCH_SIZE;
+ struct xdp_sock *xs = xdp_sk(sk);
+ bool sent_frame = false;
+ struct xdp_desc desc;
+ struct sk_buff *skb;
+ int err = 0;
+
+ if (unlikely(!xs->tx))
+ return -ENOBUFS;
+
+ mutex_lock(&xs->mutex);
+
+ while (xskq_peek_desc(xs->tx, &desc)) {
+ char *buffer;
+ u64 addr;
+ u32 len;
+
+ if (max_batch-- == 0) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ if (xskq_reserve_addr(xs->umem->cq)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ len = desc.len;
+ if (unlikely(len > xs->dev->mtu)) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ if (xs->queue_id >= xs->dev->real_num_tx_queues) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ skb = sock_alloc_send_skb(sk, len, 1, &err);
+ if (unlikely(!skb)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ skb_put(skb, len);
+ addr = desc.addr;
+ buffer = xdp_umem_get_data(xs->umem, addr);
+ err = skb_store_bits(skb, 0, buffer, len);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ skb->dev = xs->dev;
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+ skb->destructor = xsk_destruct_skb;
+
+ err = dev_direct_xmit(skb, xs->queue_id);
+ /* Ignore NET_XMIT_CN as packet might have been sent */
+ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
+ err = -EAGAIN;
+ /* SKB consumed by dev_direct_xmit() */
+ goto out;
+ }
+
+ sent_frame = true;
+ xskq_discard_desc(xs->tx);
+ }
+
+out:
+ if (sent_frame)
+ sk->sk_write_space(sk);
+
+ mutex_unlock(&xs->mutex);
+ return err;
+}
+
+static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+ bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (unlikely(!xs->dev))
+ return -ENXIO;
+ if (unlikely(!(xs->dev->flags & IFF_UP)))
+ return -ENETDOWN;
+ if (need_wait)
+ return -EOPNOTSUPP;
+
+ return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
+}
+
+static __poll_t xsk_poll_mask(struct socket *sock, __poll_t events)
+{
+ __poll_t mask = datagram_poll_mask(sock, events);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (xs->rx && !xskq_empty_desc(xs->rx))
+ mask |= POLLIN | POLLRDNORM;
+ if (xs->tx && !xskq_full_desc(xs->tx))
+ mask |= POLLOUT | POLLWRNORM;
+
+ return mask;
+}
+
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
+ bool umem_queue)
+{
+ struct xsk_queue *q;
+
+ if (entries == 0 || *queue || !is_power_of_2(entries))
+ return -EINVAL;
+
+ q = xskq_create(entries, umem_queue);
+ if (!q)
+ return -ENOMEM;
+
+ /* Make sure queue is ready before it can be seen by others */
+ smp_wmb();
+ *queue = q;
+ return 0;
+}
+
+static int xsk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ net = sock_net(sk);
+
+ local_bh_disable();
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ local_bh_enable();
+
+ if (xs->dev) {
+ /* Wait for driver to stop using the xdp socket. */
+ synchronize_net();
+ dev_put(xs->dev);
+ xs->dev = NULL;
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ sk_refcnt_debug_release(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static struct socket *xsk_lookup_xsk_from_fd(int fd)
+{
+ struct socket *sock;
+ int err;
+
+ sock = sockfd_lookup(fd, &err);
+ if (!sock)
+ return ERR_PTR(-ENOTSOCK);
+
+ if (sock->sk->sk_family != PF_XDP) {
+ sockfd_put(sock);
+ return ERR_PTR(-ENOPROTOOPT);
+ }
+
+ return sock;
+}
+
+static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+ struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ struct net_device *dev;
+ u32 flags, qid;
+ int err = 0;
+
+ if (addr_len < sizeof(struct sockaddr_xdp))
+ return -EINVAL;
+ if (sxdp->sxdp_family != AF_XDP)
+ return -EINVAL;
+
+ mutex_lock(&xs->mutex);
+ if (xs->dev) {
+ err = -EBUSY;
+ goto out_release;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_release;
+ }
+
+ if (!xs->rx && !xs->tx) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ qid = sxdp->sxdp_queue_id;
+
+ if ((xs->rx && qid >= dev->real_num_rx_queues) ||
+ (xs->tx && qid >= dev->real_num_tx_queues)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ flags = sxdp->sxdp_flags;
+
+ if (flags & XDP_SHARED_UMEM) {
+ struct xdp_sock *umem_xs;
+ struct socket *sock;
+
+ if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+ /* Cannot specify flags for shared sockets. */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (xs->umem) {
+ /* We have already our own. */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
+ if (IS_ERR(sock)) {
+ err = PTR_ERR(sock);
+ goto out_unlock;
+ }
+
+ umem_xs = xdp_sk(sock->sk);
+ if (!umem_xs->umem) {
+ /* No umem to inherit. */
+ err = -EBADF;
+ sockfd_put(sock);
+ goto out_unlock;
+ } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+ err = -EINVAL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
+ xdp_get_umem(umem_xs->umem);
+ xs->umem = umem_xs->umem;
+ sockfd_put(sock);
+ } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+ err = -EINVAL;
+ goto out_unlock;
+ } else {
+ /* This xsk has its own umem. */
+ xskq_set_umem(xs->umem->fq, &xs->umem->props);
+ xskq_set_umem(xs->umem->cq, &xs->umem->props);
+
+ err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
+ if (err)
+ goto out_unlock;
+ }
+
+ xs->dev = dev;
+ xs->zc = xs->umem->zc;
+ xs->queue_id = qid;
+ xskq_set_umem(xs->rx, &xs->umem->props);
+ xskq_set_umem(xs->tx, &xs->umem->props);
+ xdp_add_sk_umem(xs->umem, xs);
+
+out_unlock:
+ if (err)
+ dev_put(dev);
+out_release:
+ mutex_unlock(&xs->mutex);
+ return err;
+}
+
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ int err;
+
+ if (level != SOL_XDP)
+ return -ENOPROTOOPT;
+
+ switch (optname) {
+ case XDP_RX_RING:
+ case XDP_TX_RING:
+ {
+ struct xsk_queue **q;
+ int entries;
+
+ if (optlen < sizeof(entries))
+ return -EINVAL;
+ if (copy_from_user(&entries, optval, sizeof(entries)))
+ return -EFAULT;
+
+ mutex_lock(&xs->mutex);
+ q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
+ err = xsk_init_queue(entries, q, false);
+ mutex_unlock(&xs->mutex);
+ return err;
+ }
+ case XDP_UMEM_REG:
+ {
+ struct xdp_umem_reg mr;
+ struct xdp_umem *umem;
+
+ if (copy_from_user(&mr, optval, sizeof(mr)))
+ return -EFAULT;
+
+ mutex_lock(&xs->mutex);
+ if (xs->umem) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
+
+ umem = xdp_umem_create(&mr);
+ if (IS_ERR(umem)) {
+ mutex_unlock(&xs->mutex);
+ return PTR_ERR(umem);
+ }
+
+ /* Make sure umem is ready before it can be seen by others */
+ smp_wmb();
+ xs->umem = umem;
+ mutex_unlock(&xs->mutex);
+ return 0;
+ }
+ case XDP_UMEM_FILL_RING:
+ case XDP_UMEM_COMPLETION_RING:
+ {
+ struct xsk_queue **q;
+ int entries;
+
+ if (copy_from_user(&entries, optval, sizeof(entries)))
+ return -EFAULT;
+
+ mutex_lock(&xs->mutex);
+ if (!xs->umem) {
+ mutex_unlock(&xs->mutex);
+ return -EINVAL;
+ }
+
+ q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
+ &xs->umem->cq;
+ err = xsk_init_queue(entries, q, true);
+ mutex_unlock(&xs->mutex);
+ return err;
+ }
+ default:
+ break;
+ }
+
+ return -ENOPROTOOPT;
+}
+
+static int xsk_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+ int len;
+
+ if (level != SOL_XDP)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case XDP_STATISTICS:
+ {
+ struct xdp_statistics stats;
+
+ if (len < sizeof(stats))
+ return -EINVAL;
+
+ mutex_lock(&xs->mutex);
+ stats.rx_dropped = xs->rx_dropped;
+ stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
+ stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
+ mutex_unlock(&xs->mutex);
+
+ if (copy_to_user(optval, &stats, sizeof(stats)))
+ return -EFAULT;
+ if (put_user(sizeof(stats), optlen))
+ return -EFAULT;
+
+ return 0;
+ }
+ case XDP_MMAP_OFFSETS:
+ {
+ struct xdp_mmap_offsets off;
+
+ if (len < sizeof(off))
+ return -EINVAL;
+
+ off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+ off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+ off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
+ off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+ off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+ off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);
+
+ off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+ off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+ off.fr.desc = offsetof(struct xdp_umem_ring, desc);
+ off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+ off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+ off.cr.desc = offsetof(struct xdp_umem_ring, desc);
+
+ len = sizeof(off);
+ if (copy_to_user(optval, &off, len))
+ return -EFAULT;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+ }
+ default:
+ break;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int xsk_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct xdp_sock *xs = xdp_sk(sock->sk);
+ struct xsk_queue *q = NULL;
+ struct xdp_umem *umem;
+ unsigned long pfn;
+ struct page *qpg;
+
+ if (offset == XDP_PGOFF_RX_RING) {
+ q = READ_ONCE(xs->rx);
+ } else if (offset == XDP_PGOFF_TX_RING) {
+ q = READ_ONCE(xs->tx);
+ } else {
+ umem = READ_ONCE(xs->umem);
+ if (!umem)
+ return -EINVAL;
+
+ if (offset == XDP_UMEM_PGOFF_FILL_RING)
+ q = READ_ONCE(umem->fq);
+ else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
+ q = READ_ONCE(umem->cq);
+ }
+
+ if (!q)
+ return -EINVAL;
+
+ qpg = virt_to_head_page(q->ring);
+ if (size > (PAGE_SIZE << compound_order(qpg)))
+ return -EINVAL;
+
+ pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+ return remap_pfn_range(vma, vma->vm_start, pfn,
+ size, vma->vm_page_prot);
+}
+
+static struct proto xsk_proto = {
+ .name = "XDP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct xdp_sock),
+};
+
+static const struct proto_ops xsk_proto_ops = {
+ .family = PF_XDP,
+ .owner = THIS_MODULE,
+ .release = xsk_release,
+ .bind = xsk_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll_mask = xsk_poll_mask,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = xsk_setsockopt,
+ .getsockopt = xsk_getsockopt,
+ .sendmsg = xsk_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .mmap = xsk_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static void xsk_destruct(struct sock *sk)
+{
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ return;
+
+ xskq_destroy(xs->rx);
+ xskq_destroy(xs->tx);
+ xdp_del_sk_umem(xs->umem, xs);
+ xdp_put_umem(xs->umem);
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct xdp_sock *xs;
+
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ if (protocol)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+ if (!sk)
+ return -ENOBUFS;
+
+ sock->ops = &xsk_proto_ops;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_family = PF_XDP;
+
+ sk->sk_destruct = xsk_destruct;
+ sk_refcnt_debug_inc(sk);
+
+ xs = xdp_sk(sk);
+ mutex_init(&xs->mutex);
+
+ local_bh_disable();
+ sock_prot_inuse_add(net, &xsk_proto, 1);
+ local_bh_enable();
+
+ return 0;
+}
+
+static const struct net_proto_family xsk_family_ops = {
+ .family = PF_XDP,
+ .create = xsk_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init xsk_init(void)
+{
+ int err;
+
+ err = proto_register(&xsk_proto, 0 /* no slab */);
+ if (err)
+ goto out;
+
+ err = sock_register(&xsk_family_ops);
+ if (err)
+ goto out_proto;
+
+ return 0;
+
+out_proto:
+ proto_unregister(&xsk_proto);
+out:
+ return err;
+}
+
+fs_initcall(xsk_init);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..6c32e92e98fc
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
+{
+ if (!q)
+ return;
+
+ q->umem_props = *umem_props;
+}
+
+static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+ return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64);
+}
+
+static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
+{
+ return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc);
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
+{
+ struct xsk_queue *q;
+ gfp_t gfp_flags;
+ size_t size;
+
+ q = kzalloc(sizeof(*q), GFP_KERNEL);
+ if (!q)
+ return NULL;
+
+ q->nentries = nentries;
+ q->ring_mask = nentries - 1;
+
+ gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+ __GFP_COMP | __GFP_NORETRY;
+ size = umem_queue ? xskq_umem_get_ring_size(q) :
+ xskq_rxtx_get_ring_size(q);
+
+ q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
+ get_order(size));
+ if (!q->ring) {
+ kfree(q);
+ return NULL;
+ }
+
+ return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+ if (!q)
+ return;
+
+ page_frag_free(q->ring);
+ kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..ef6a6f0ec949
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+#include <net/xdp_sock.h>
+
+#define RX_BATCH_SIZE 16
+#define LAZY_UPDATE_THRESHOLD 128
+
+struct xdp_ring {
+ u32 producer ____cacheline_aligned_in_smp;
+ u32 consumer ____cacheline_aligned_in_smp;
+};
+
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+ struct xdp_ring ptrs;
+ struct xdp_desc desc[0] ____cacheline_aligned_in_smp;
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+ struct xdp_ring ptrs;
+ u64 desc[0] ____cacheline_aligned_in_smp;
+};
+
+struct xsk_queue {
+ struct xdp_umem_props umem_props;
+ u32 ring_mask;
+ u32 nentries;
+ u32 prod_head;
+ u32 prod_tail;
+ u32 cons_head;
+ u32 cons_tail;
+ struct xdp_ring *ring;
+ u64 invalid_descs;
+};
+
+/* Common functions operating for both RXTX and umem queues */
+
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+ return q ? q->invalid_descs : 0;
+}
+
+static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
+{
+ u32 entries = q->prod_tail - q->cons_tail;
+
+ if (entries == 0) {
+ /* Refresh the local pointer */
+ q->prod_tail = READ_ONCE(q->ring->producer);
+ entries = q->prod_tail - q->cons_tail;
+ }
+
+ return (entries > dcnt) ? dcnt : entries;
+}
+
+static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
+{
+ return q->nentries - (producer - q->cons_tail);
+}
+
+static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
+{
+ u32 free_entries = xskq_nb_free_lazy(q, producer);
+
+ if (free_entries >= dcnt)
+ return free_entries;
+
+ /* Refresh the local tail pointer */
+ q->cons_tail = READ_ONCE(q->ring->consumer);
+ return q->nentries - (producer - q->cons_tail);
+}
+
+/* UMEM queue */
+
+static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
+{
+ if (addr >= q->umem_props.size) {
+ q->invalid_descs++;
+ return false;
+ }
+
+ return true;
+}
+
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
+{
+ while (q->cons_tail != q->cons_head) {
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ unsigned int idx = q->cons_tail & q->ring_mask;
+
+ *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
+ if (xskq_is_valid_addr(q, *addr))
+ return addr;
+
+ q->cons_tail++;
+ }
+
+ return NULL;
+}
+
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
+{
+ if (q->cons_tail == q->cons_head) {
+ WRITE_ONCE(q->ring->consumer, q->cons_tail);
+ q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
+
+ /* Order consumer and data */
+ smp_rmb();
+ }
+
+ return xskq_validate_addr(q, addr);
+}
+
+static inline void xskq_discard_addr(struct xsk_queue *q)
+{
+ q->cons_tail++;
+}
+
+static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
+ return -ENOSPC;
+
+ ring->desc[q->prod_tail++ & q->ring_mask] = addr;
+
+ /* Order producer and data */
+ smp_wmb();
+
+ WRITE_ONCE(q->ring->producer, q->prod_tail);
+ return 0;
+}
+
+static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
+ return -ENOSPC;
+
+ ring->desc[q->prod_head++ & q->ring_mask] = addr;
+ return 0;
+}
+
+static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
+ u32 nb_entries)
+{
+ /* Order producer and data */
+ smp_wmb();
+
+ q->prod_tail += nb_entries;
+ WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
+static inline int xskq_reserve_addr(struct xsk_queue *q)
+{
+ if (xskq_nb_free(q, q->prod_head, 1) == 0)
+ return -ENOSPC;
+
+ q->prod_head++;
+ return 0;
+}
+
+/* Rx/Tx queue */
+
+static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+{
+ if (!xskq_is_valid_addr(q, d->addr))
+ return false;
+
+ if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
+ (d->addr & q->umem_props.chunk_mask)) {
+ q->invalid_descs++;
+ return false;
+ }
+
+ return true;
+}
+
+static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
+ struct xdp_desc *desc)
+{
+ while (q->cons_tail != q->cons_head) {
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ unsigned int idx = q->cons_tail & q->ring_mask;
+
+ *desc = READ_ONCE(ring->desc[idx]);
+ if (xskq_is_valid_desc(q, desc))
+ return desc;
+
+ q->cons_tail++;
+ }
+
+ return NULL;
+}
+
+static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
+ struct xdp_desc *desc)
+{
+ if (q->cons_tail == q->cons_head) {
+ WRITE_ONCE(q->ring->consumer, q->cons_tail);
+ q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
+
+ /* Order consumer and data */
+ smp_rmb();
+ }
+
+ return xskq_validate_desc(q, desc);
+}
+
+static inline void xskq_discard_desc(struct xsk_queue *q)
+{
+ q->cons_tail++;
+}
+
+static inline int xskq_produce_batch_desc(struct xsk_queue *q,
+ u64 addr, u32 len)
+{
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ unsigned int idx;
+
+ if (xskq_nb_free(q, q->prod_head, 1) == 0)
+ return -ENOSPC;
+
+ idx = (q->prod_head++) & q->ring_mask;
+ ring->desc[idx].addr = addr;
+ ring->desc[idx].len = len;
+
+ return 0;
+}
+
+static inline void xskq_produce_flush_desc(struct xsk_queue *q)
+{
+ /* Order producer and data */
+ smp_wmb();
+
+ q->prod_tail = q->prod_head,
+ WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
+static inline bool xskq_full_desc(struct xsk_queue *q)
+{
+ return xskq_nb_avail(q, q->nentries) == q->nentries;
+}
+
+static inline bool xskq_empty_desc(struct xsk_queue *q)
+{
+ return xskq_nb_free(q, q->prod_tail, 1) == q->nentries;
+}
+
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
+void xskq_destroy(struct xsk_queue *q_ops);
+
+#endif /* _LINUX_XSK_QUEUE_H */