diff options
author | David S. Miller <davem@davemloft.net> | 2015-02-11 15:12:19 -0800 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-02-11 15:12:19 -0800 |
commit | 777b3e930ac8eb1f8360b3e4f2aaf5e4abe5ed46 (patch) | |
tree | fc3c9744cddeb0bba07c0ed258de06df4eb01487 /drivers | |
parent | 13101602c4a9f653d59af9469040797bc5b361ca (diff) | |
parent | fe881ef11cf0220f118816181930494d484c4883 (diff) |
Merge branch 'rco_correctness'
Tom Herbert says:
====================
net: Fixes to remote checksum offload and CHECKSUM_PARTIAL
This patch set fixes a correctness problem with remote checksum
offload, clarifies the meaning of CHECKSUM_PARTIAL, and allows
remote checksum offload to set CHECKSUM_PARTIAL instead of
calling csum_partial and modifying the checksum.
Specifically:
- In the GRO remote checksum path, restore the checksum after
calling lower layer GRO functions. This is needed if the
packet is forwarded off host with the Remote Checksum Offload
option still present.
- Clarify the meaning of CHECKSUM_PARTIAL in the receive path. Only
the checksums referred to by checksum partial and any preceding
checksums can be considered verified.
- Fixes to UDP tunnel GRO complete. Need to set SKB_GSO_UDP_TUNNEL_*,
SKB_GSO_TUNNEL_REMCSUM, and skb->encapsulation for forwarding
case.
- Infrastructure to allow setting of CHECKSUM_PARTIAL in remote
checksum offload. This a potential performance benefit instead
of calling csum_partial (potentially twice, once in GRO path
and once in normal path). The downside of using CHECKSUM_PARTIAL
and not actually writing the checksum is that we aren't verifying
that the sender correctly wrote the pseudo checksum into the
checksum field, or that the start/offset values actually point
to a checksum. If the sender did not set up these fields correctly,
a packet might be accepted locally, but not accepted by a peer
when the packet is forwarded off host. Verifying these fields
seems non-trivial, and because the fields can only be incorrect
due to sender error and not corruption (outer checksum protects
against that) we'll make CHECKSUM_PARTIAL the default. This
behavior can be reverted via a netlink option on the encapsulation
socket.
- Change VXLAN and GUE to set CHECKSUM_PARTIAL in remote checksum
offload by default, configuration hooks can revert to using
csum_partial.
Testing:
I ran performance numbers using netperf TCP_STREAM and TCP_RR with 200
streams for GRE/GUE and for VXLAN. This compares before the fixes,
the fixes with not setting checksum partial in remote checksum offload,
and with the fixes setting checksum partial. The overall effect seems
to be that using checksum partial is a slight performance win, perf
definitely shows a significant reduction of time in csum_partial on
the receive CPUs.
GRE/GUE
TCP_STREAM
Before fixes
9.22% TX CPU utilization
13.57% RX CPU utilization
9133 Mbps
Not using checksum partial
9.59% TX CPU utilization
14.95% RX CPU utilization
9132 Mbps
Using checksum partial
9.37% TX CPU utilization
13.89% RX CPU utilization
9132 Mbps
TCP_RR
Before fixes
CPU utilization
159/251/447 90/95/99% latencies
1.1462e+06 tps
Not using checksum partial
92.94% CPU utilization
158/253/445 90/95/99% latencies
1.12988e+06 tps
Using checksum partial
92.78% CPU utilization
158/250/450 90/95/99% latencies
1.15343e+06 tps
VXLAN
TCP_STREAM
Before fixes
9.24% TX CPU utilization
13.74% RX CPU utilization
9093 Mbps
Not using checksum partial
9.95% TX CPU utilization
14.66% RX CPU utilization
9094 Mbps
Using checksum partial
10.24% TX CPU utilization
13.32% RX CPU utilization
9093 Mbps
TCP_RR
Before fixes
92.91% CPU utilization
151/241/437 90/95/99% latencies
1.15939e+06 tps
Not using checksum partial
93.07% CPU utilization
156/246/425 90/95/99% latencies
1.1451e+06 tps
Using checksum partial
95.51% CPU utilization
156/249/459 90/95/99% latencies
1.17004e+06 tps
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/net/vxlan.c | 38 |
1 files changed, 25 insertions, 13 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0e57e862c399..1e0a775ea882 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -555,12 +555,13 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, - u32 data) + u32 data, struct gro_remcsum *grc, + bool nopartial) { size_t start, offset, plen; if (skb->remcsum_offload) - return vh; + return NULL; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -579,7 +580,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, return NULL; } - skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset); + skb_gro_remcsum_process(skb, (void *)vh + hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -597,6 +599,9 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, udp_offloads); u32 flags; + struct gro_remcsum grc; + + skb_gro_remcsum_init(&grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); @@ -614,7 +619,9 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), - ntohl(vh->vx_vni)); + ntohl(vh->vx_vni), &grc, + !!(vs->flags & + VXLAN_F_REMCSUM_NOPARTIAL)); if (!vh) goto out; @@ -637,6 +644,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, pp = eth_gro_receive(head, skb); out: + skb_gro_remcsum_cleanup(skb, &grc); NAPI_GRO_CB(skb)->flush |= flush; return pp; @@ -1150,16 +1158,10 @@ static void vxlan_igmp_leave(struct work_struct *work) } static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, - size_t hdrlen, u32 data) + size_t hdrlen, u32 data, bool nopartial) { size_t start, offset, plen; - if (skb->remcsum_offload) { - /* Already processed in GRO path */ - skb->remcsum_offload = 0; - 
return vh; - } - start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; offset = start + ((data & VXLAN_RCO_UDP) ? offsetof(struct udphdr, check) : @@ -1172,7 +1174,8 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset); + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset, + nopartial); return vh; } @@ -1209,7 +1212,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto drop; if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { - vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni); + vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni, + !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); if (!vxh) goto drop; @@ -2438,6 +2442,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, + [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -2761,6 +2766,9 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_GBP]) vxlan->flags |= VXLAN_F_GBP; + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) + vxlan->flags |= VXLAN_F_REMCSUM_NOPARTIAL; + if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET, vxlan->dst_port, vxlan->flags)) { pr_info("duplicate VNI %u\n", vni); @@ -2910,6 +2918,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_flag(skb, IFLA_VXLAN_GBP)) goto nla_put_failure; + if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL && + nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) + goto nla_put_failure; + return 0; nla_put_failure: |