diff options
author | David S. Miller <davem@davemloft.net> | 2015-01-14 15:20:11 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-01-14 15:20:11 -0500 |
commit | 2733135329e9bbc306be9f58af1b4be92b359d23 (patch) | |
tree | ae1ac810501bacae10209b8faa94595fc10c3884 | |
parent | d823ab68fbb0fa504a2490bd499ac921bdf497d8 (diff) | |
parent | dfd8645ea1bd91277f841e74c33e1f4dbbede808 (diff) |
Merge branch 'vxlan_rco'
Tom Herbert says:
====================
net: Remote checksum offload for VXLAN
This patch set adds support for remote checksum offload in VXLAN.
The remote checksum offload is generalized by creating a common
function (remcsum_adjust) that does the work of modifying the
checksum in remote checksum offload. This function can be called
from normal or GRO path. GUE was modified to use this function.
To support RCO is VXLAN we use the 9th bit in the reserved
flags to indicated remote checksum offload. The start and offset
values are encoded n a compressed form in the low order (reserved)
byte of the vni field.
Remote checksum offload is described in
https://tools.ietf.org/html/draft-herbert-remotecsumoffload-01
Changes in v2:
- Add udp_offload_callbacks which has GRO functions that take a
udp_offload pointer argument. This argument can be used to retrieve
a per port structure of the encapsulation for use in gro processing
(mostly by doing container_of on the structure).
- Use the 10th bit in VXLAN flags for RCO which does not seem to
conflict with other proposals at this time (ie. VXLAN-GPE and
VXLAN-GPB)
- Require that RCO must be explicitly enabled on the receiver
as well as the sender.
Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4).
With UDP checksums and Remote Checksum Offload
IPv4
Client
11.84% CPU utilization
Server
12.96% CPU utilization
9197 Mbps
IPv6
Client
12.46% CPU utilization
Server
14.48% CPU utilization
8963 Mbps
With UDP checksums, no remote checksum offload
IPv4
Client
15.67% CPU utilization
Server
14.83% CPU utilization
9094 Mbps
IPv6
Client
16.21% CPU utilization
Server
14.32% CPU utilization
9058 Mbps
No UDP checksums
IPv4
Client
15.03% CPU utilization
Server
23.09% CPU utilization
9089 Mbps
IPv6
Client
16.18% CPU utilization
Server
26.57% CPU utilization
8954 Mbps
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/vxlan.c | 198 | ||||
-rw-r--r-- | include/linux/netdevice.h | 15 | ||||
-rw-r--r-- | include/net/vxlan.h | 11 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h | 2 | ||||
-rw-r--r-- | net/ipv4/fou.c | 12 | ||||
-rw-r--r-- | net/ipv4/geneve.c | 6 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 7 |
7 files changed, 233 insertions, 18 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 985359dd6033..99df0d76157c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -539,12 +539,57 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, + unsigned int off, + struct vxlanhdr *vh, size_t hdrlen, + u32 data) +{ + size_t start, offset, plen; + __wsum delta; + + if (skb->remcsum_offload) + return vh; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return NULL; + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + vh = skb_gro_header_slow(skb, off + plen, off); + if (!vh) + return NULL; + } + + delta = remcsum_adjust((void *)vh + hdrlen, + NAPI_GRO_CB(skb)->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + skb->remcsum_offload = 1; + + return vh; +} + +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; int flush = 1; + struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, + udp_offloads); + u32 flags; off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); @@ -555,6 +600,19 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff goto out; } + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ + skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); + + flags = ntohl(vh->vx_flags); + + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), + ntohl(vh->vx_vni)); + + if (!vh) + goto out; + } + flush = 0; for (p = *head; p; p = p->next) { @@ -568,8 +626,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff } } - skb_gro_pull(skb, sizeof(struct vxlanhdr)); - skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); pp = eth_gro_receive(head, skb); out: @@ -578,7 +634,8 @@ out: return pp; } -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { udp_tunnel_gro_complete(skb, nhoff); @@ -1084,6 +1141,42 @@ static void vxlan_igmp_leave(struct work_struct *work) dev_put(vxlan->dev); } +static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, + size_t hdrlen, u32 data) +{ + size_t start, offset, plen; + __wsum delta; + + if (skb->remcsum_offload) { + /* Already processed in GRO path */ + skb->remcsum_offload = 0; + return vh; + } + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + if (!pskb_may_pull(skb, plen)) + return NULL; + + vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) + __skb_checksum_complete(skb); + + delta = remcsum_adjust((void *)vh + hdrlen, + skb->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + + return vh; +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { @@ -1108,12 +1201,22 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) goto drop; + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; - if (flags || (vni & 0xff)) { + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni); + if (!vxh) + goto drop; + + flags &= ~VXLAN_HF_RCO; + vni &= VXLAN_VID_MASK; + } + + if (flags || (vni & ~VXLAN_VID_MASK)) { /* If there are any unprocessed flags remaining treat * this as a malformed packet. This behavior diverges from * VXLAN RFC (RFC7348) which stipulates that bits in reserved @@ -1550,8 +1653,23 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, int min_headroom; int err; bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk); + int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + u16 hdrlen = sizeof(struct vxlanhdr); + + if ((vs->flags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + } + } - skb = udp_tunnel_handle_offloads(skb, udp_sum); + skb = iptunnel_handle_offloads(skb, udp_sum, type); if (IS_ERR(skb)) { err = -EINVAL; goto err; @@ -1580,6 +1698,22 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, vxh->vx_flags = htonl(VXLAN_HF_VNI); vxh->vx_vni = vni; + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + } + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio, @@ -1600,8 +1734,23 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, int min_headroom; int err; bool udp_sum = !vs->sock->sk->sk_no_check_tx; + int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + u16 hdrlen = sizeof(struct vxlanhdr); + + if ((vs->flags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + } + } - skb = udp_tunnel_handle_offloads(skb, udp_sum); + skb = iptunnel_handle_offloads(skb, udp_sum, type); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -1624,6 +1773,22 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, vxh->vx_flags = htonl(VXLAN_HF_VNI); vxh->vx_vni = vni; + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + } + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos, @@ -2215,6 +2380,8 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -2336,6 +2503,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, atomic_set(&vs->refcnt, 1); vs->rcv = rcv; vs->data = data; + vs->flags = flags; /* Initialize the vxlan udp offloads structure */ vs->udp_offloads.port = port; @@ -2530,6 +2698,14 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; + if (data[IFLA_VXLAN_REMCSUM_TX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) + vxlan->flags |= VXLAN_F_REMCSUM_TX; + + if (data[IFLA_VXLAN_REMCSUM_RX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) + vxlan->flags |= VXLAN_F_REMCSUM_RX; + if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET, vxlan->dst_port)) { pr_info("duplicate VNI %u\n", vni); @@ -2598,6 +2774,8 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ 0; } @@ -2663,7 +2841,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, - !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX))) + !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, + !!(vxlan->flags & VXLAN_F_REMCSUM_TX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, + !!(vxlan->flags & VXLAN_F_REMCSUM_RX))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 679e6e90aa4c..47921c291dd6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1969,7 +1969,7 @@ struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -1979,10 +1979,21 @@ struct packet_offload { struct list_head list; }; +struct udp_offload; + +struct udp_offload_callbacks { + struct sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff); + int (*gro_complete)(struct sk_buff *skb, + int nhoff, + struct udp_offload *uoff); +}; + struct udp_offload { __be16 port; u8 ipproto; - struct offload_callbacks callbacks; + struct udp_offload_callbacks callbacks; }; /* often modified stats are per cpu, other are shared (netdev->stats) */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index a0d80736224f..0a7443b49133 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -19,6 +19,14 @@ struct vxlanhdr { /* VXLAN header flags. */ #define VXLAN_HF_VNI 0x08000000 +#define VXLAN_HF_RCO 0x00200000 + +/* Remote checksum offload header option */ +#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ +#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ +#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ +#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) +#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) #define VXLAN_N_VID (1u << 24) #define VXLAN_VID_MASK (VXLAN_N_VID - 1) @@ -38,6 +46,7 @@ struct vxlan_sock { struct hlist_head vni_list[VNI_HASH_SIZE]; atomic_t refcnt; struct udp_offload udp_offloads; + u32 flags; }; #define VXLAN_F_LEARN 0x01 @@ -49,6 +58,8 @@ struct vxlan_sock { #define VXLAN_F_UDP_CSUM 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 +#define VXLAN_F_REMCSUM_TX 0x200 +#define VXLAN_F_REMCSUM_RX 0x400 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, vxlan_rcv_t *rcv, void *data, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f7d0d2d7173a..b2723f65846f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -370,6 +370,8 @@ enum { IFLA_VXLAN_UDP_CSUM, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 2197c36f722f..3bc0cf07661c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -174,7 +174,8 @@ drop: } static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { const struct net_offload *ops; struct sk_buff **pp = NULL; @@ -195,7 +196,8 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff) +static int fou_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { const struct net_offload *ops; u8 proto = NAPI_GRO_CB(skb)->proto; @@ -254,7 +256,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, } static struct sk_buff **gue_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { const struct net_offload **offloads; const struct net_offload *ops; @@ -360,7 +363,8 @@ out: return pp; } -static int gue_gro_complete(struct sk_buff *skb, int nhoff) +static int gue_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 23744c7a9718..9568594ca2f1 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -147,7 +147,8 @@ static int geneve_hlen(struct genevehdr *gh) } static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct genevehdr *gh, *gh2; @@ -211,7 +212,8 @@ out: return pp; } -static int geneve_gro_complete(struct sk_buff *skb, int nhoff) +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { struct genevehdr *gh; struct packet_offload *ptype; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d3e537ef6b7f..d10f6f4ead27 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -339,7 +339,8 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - pp = uo_priv->offload->callbacks.gro_receive(head, skb); + pp = uo_priv->offload->callbacks.gro_receive(head, skb, + uo_priv->offload); out_unlock: rcu_read_unlock(); @@ -395,7 +396,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) if (uo_priv != NULL) { NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + err = uo_priv->offload->callbacks.gro_complete(skb, + nhoff + sizeof(struct udphdr), + uo_priv->offload); } rcu_read_unlock(); |