From af12fa6e46aa651e7b86a4c4117b562518fef184 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:39:41 +0300 Subject: net: add napi_id and hash Adds a napi_id and a hashing mechanism to lookup a napi by id. This will be used by subsequent patches to implement low latency Ethernet device polling. Based on a code sample by Eric Dumazet. Signed-off-by: Eliezer Tamir Signed-off-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 9c18557f93c6..fa007dba6beb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -166,6 +167,12 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) @@ -4136,6 +4143,58 @@ void napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { -- cgit v1.2.3 From 060212928670593fb89243640bf05cf89560b023 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:39:50 +0300 Subject: net: add low latency socket poll Adds an ndo_ll_poll method and the code that supports it. This method can be used by low latency applications to busy-poll Ethernet device queues directly from the socket code. sysctl_net_ll_poll controls how many microseconds to poll. Default is zero (disabled). Individual protocol support will be added by subsequent patches. Signed-off-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Signed-off-by: Eliezer Tamir Acked-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 7 ++ include/linux/netdevice.h | 3 + include/linux/skbuff.h | 8 ++- include/net/ll_poll.h | 148 +++++++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 4 ++ include/uapi/linux/snmp.h | 1 + net/Kconfig | 12 ++++ net/core/skbuff.c | 4 ++ net/core/sock.c | 6 ++ net/core/sysctl_net_core.c | 10 +++ net/ipv4/proc.c | 1 + net/socket.c | 6 ++ 12 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 include/net/ll_poll.h (limited to 'net') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index c1f8640c2fc8..85ab72dcdc3c 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 +low_latency_poll +---------------- +Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) +Approximate time in us to spin waiting for packets on the device queue. +Recommended value is 50. May increase power usage. +Default: 0 (off) + rmem_default ------------ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39bbd462d68e..2ecb96d9a1e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -971,6 +971,9 @@ struct net_device_ops { struct netpoll_info *info, gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); +#endif +#ifdef CONFIG_NET_LL_RX_POLL + int (*ndo_ll_poll)(struct napi_struct *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9995834d2cb6..400d82ae2b03 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions + * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark * @dropcount: total number of sk_receive_queue overflows @@ -500,8 +501,11 @@ struct sk_buff { /* 7/9 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#ifdef CONFIG_NET_DMA - dma_cookie_t dma_cookie; +#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL + union { + unsigned int napi_id; + dma_cookie_t dma_cookie; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h new file mode 100644 index 000000000000..bc262f88173f --- /dev/null +++ b/include/net/ll_poll.h @@ -0,0 +1,148 @@ +/* + * Low Latency Sockets + * Copyright(c) 2013 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Author: Eliezer Tamir + * + * Contact Information: + * e1000-devel Mailing List + */ + +/* + * For now this depends on CONFIG_X86_TSC + */ + +#ifndef _LINUX_NET_LL_POLL_H +#define _LINUX_NET_LL_POLL_H + +#include +#include + +#ifdef CONFIG_NET_LL_RX_POLL + +struct napi_struct; +extern unsigned long sysctl_net_ll_poll __read_mostly; + +/* return values from ndo_ll_poll */ +#define LL_FLUSH_FAILED -1 +#define LL_FLUSH_BUSY -2 + +/* we don't mind a ~2.5% imprecision */ +#define TSC_MHZ (tsc_khz >> 10) + +static inline cycles_t ll_end_time(void) +{ + return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles(); +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return sysctl_net_ll_poll && sk->sk_napi_id && + !need_resched() && !signal_pending(current); +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return !time_after((unsigned long)get_cycles(), + (unsigned long)end_time); +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + cycles_t end_time = ll_end_time(); + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_ll_poll) + goto out; + + do { + + rc = ops->ndo_ll_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_LOWLATENCYRXPACKETS, rc); + + } while (skb_queue_empty(&sk->sk_receive_queue) + && can_poll_ll(end_time) && !nonblock); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} + +/* used in the NIC receive handler to mark the skb */ +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ + skb->napi_id = napi->napi_id; +} + +/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_napi_id = skb->napi_id; +} + +#else /* CONFIG_NET_LL_RX_POLL */ + +static inline cycles_t ll_end_time(void) +{ + return 0; +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return false; +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + return false; +} + +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ +} + +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return false; +} + +#endif /* CONFIG_NET_LL_RX_POLL */ +#endif /* _LINUX_NET_LL_POLL_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 66772cf8c3c5..ac8e1818380c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -229,6 +229,7 @@ struct cg_proto; * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward + * @sk_napi_id: id of the last napi context to receive data for sk * @sk_allocation: allocation mode * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, @@ -324,6 +325,9 @@ struct sock { int sk_forward_alloc; #ifdef CONFIG_RPS __u32 sk_rxhash; +#endif +#ifdef CONFIG_NET_LL_RX_POLL + unsigned int sk_napi_id; #endif atomic_t sk_drops; int sk_rcvbuf; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df2e8b4f9c03..26cbf76f8058 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,6 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ + LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/Kconfig b/net/Kconfig index 523e43e6da1b..d6a9ce6e1800 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -243,6 +243,18 @@ config NETPRIO_CGROUP Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis +config NET_LL_RX_POLL + bool "Low Latency Receive Poll" + depends on X86_TSC + default n + ---help--- + Support Low Latency Receive Queue Poll. + (For network card drivers which support this option.) + When waiting for data in read or poll call directly into the the device driver + to flush packets which may be pending on the device queues into the stack. + + If unsure, say N. + config BQL boolean depends on SYSFS diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 73f57a0e1523..4a4181e16c1a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_LL_RX_POLL + new->napi_id = old->napi_id; +#endif } /* diff --git a/net/core/sock.c b/net/core/sock.c index 88868a9d21da..788c0da5eed1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include #endif +#include + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_LL_RX_POLL + sk->sk_napi_id = 0; +#endif + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 741db5fc7806..4b48f39582b0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,6 +19,7 @@ #include #include #include +#include static int one = 1; @@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_LL_RX_POLL + { + .procname = "low_latency_poll", + .data = &sysctl_net_ll_poll, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d2415..6577a1149a47 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), + SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), SNMP_MIB_SENTINEL }; diff --git a/net/socket.c b/net/socket.c index 3ebdcb805c51..21fd29f63ed2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,6 +104,12 @@ #include #include #include +#include + +#ifdef CONFIG_NET_LL_RX_POLL +unsigned long sysctl_net_ll_poll __read_mostly; +EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); +#endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, -- cgit v1.2.3 From a5b50476f77a8fcc8055c955720d05a7c2d9c532 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:40:00 +0300 Subject: udp: add low latency socket poll support Add upport for busy-polling on UDP sockets. In __udp[46]_lib_rcv add a call to sk_mark_ll() to copy the napi_id from the skb into the sk. This is done at the earliest possible moment, right after we identify which socket this skb is for. In __skb_recv_datagram When there is no data and the user tries to read we busy poll. Signed-off-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Signed-off-by: Eliezer Tamir Acked-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/datagram.c | 4 ++++ net/ipv4/udp.c | 6 +++++- net/ipv6/udp.c | 6 +++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index b71423db7785..9cbaba98ce4c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -56,6 +56,7 @@ #include #include #include +#include /* * Is a socket 'connection oriented' ? @@ -207,6 +208,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT)) + continue; + /* User doesn't want to wait */ error = -EAGAIN; if (!timeo) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c7338ec79cc0..2955b25aee6d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -109,6 +109,7 @@ #include #include #include +#include #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -1709,7 +1710,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b5808539cd5c..f77e34c5a0e2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -841,7 +842,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udpv6_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but -- cgit v1.2.3 From d30e383bb856f614ddb5bbbb5a7d3f86240e41ec Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:40:10 +0300 Subject: tcp: add low latency socket poll support. Adds low latency socket poll support for TCP. In tcp_v[46]_rcv() add a call to sk_mark_ll() to copy the napi_id from the skb to the sk. In tcp_recvmsg(), when there is no data in the socket we busy-poll. This is a good example of how to add busy-poll support to more protocols. Signed-off-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Signed-off-by: Eliezer Tamir Acked-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +++++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 3 files changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc4246940f6c..46ed9afd1f5e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include #include +#include int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; @@ -1553,6 +1554,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; + if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue) + && (sk->sk_state == TCP_ESTABLISHED)) + sk_poll_ll(sk, nonblock); + lock_sock(sk); err = -ENOTCONN; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 289039b4d8de..1063bb83e342 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -1993,6 +1994,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_ll(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0a17ed9eaf39..5cffa5c3e6b8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -63,6 +63,7 @@ #include #include #include +#include #include @@ -1498,6 +1499,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_mark_ll(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); -- cgit v1.2.3