summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2015-05-19 21:04:22 +0200
committerDavid S. Miller <davem@davemloft.net>2015-05-19 16:53:37 -0400
commit492135557dc090a1abb2cfbe1a412757e3ed68ab (patch)
treefb653816394b164a3ef99d95f299a2357f9fbf74 /net/ipv4
parent134e0dbe72bfa1059c610743fc8102fdef913bf8 (diff)
tcp: add rfc3168, section 6.1.1.1. fallback
This work as a follow-up of commit f7b3bec6f516 ("net: allow setting ecn via routing table") and adds RFC3168 section 6.1.1.1. fallback for outgoing ECN connections. In other words, this work adds a retry with a non-ECN setup SYN packet, as suggested from the RFC on the first timeout: [...] A host that receives no reply to an ECN-setup SYN within the normal SYN retransmission timeout interval MAY resend the SYN and any subsequent SYN retransmissions with CWR and ECE cleared. [...] Schematic client-side view when assuming the server is in tcp_ecn=2 mode, that is, Linux default since 2009 via commit 255cac91c3c9 ("tcp: extend ECN sysctl to allow server-side only ECN"): 1) Normal ECN-capable path: SYN ECE CWR -----> <----- SYN ACK ECE ACK -----> 2) Path with broken middlebox, when client has fallback: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN -----> <----- SYN ACK ACK -----> In case we would not have the fallback implemented, the middlebox drop point would basically end up as: SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) SYN ECE CWR ----X crappy middlebox drops packet (timeout, rtx) In any case, it's rather a smaller percentage of sites where there would occur such additional setup latency: it was found in end of 2014 that ~56% of IPv4 and 65% of IPv6 servers of Alexa 1 million list would negotiate ECN (aka tcp_ecn=2 default), 0.42% of these webservers will fail to connect when trying to negotiate with ECN (tcp_ecn=1) due to timeouts, which the fallback would mitigate with a slight latency trade-off. Recent related paper on this topic: Brian Trammell, Mirja Kühlewind, Damiano Boppart, Iain Learmonth, Gorry Fairhurst, and Richard Scheffenegger: "Enabling Internet-Wide Deployment of Explicit Congestion Notification." Proc. PAM 2015, New York. http://ecn.ethz.ch/ecn-pam15.pdf Thus, when net.ipv4.tcp_ecn=1 is being set, the patch will perform RFC3168, section 6.1.1.1. fallback on timeout. For users explicitly not wanting this which can be in DC use case, we add a net.ipv4.tcp_ecn_fallback knob that allows for disabling the fallback. tp->ecn_flags are not being cleared in tcp_ecn_clear_syn() on output, but rather we let tcp_ecn_rcv_synack() take that over on input path in case a SYN ACK ECE was delayed. Thus a spurious SYN retransmission will not prevent ECN being negotiated eventually in that case. Reference: https://www.ietf.org/proceedings/92/slides/slides-92-iccrg-1.pdf Reference: https://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Mirja Kühlewind <mirja.kuehlewind@tik.ee.ethz.ch> Signed-off-by: Brian Trammell <trammell@tik.ee.ethz.ch> Cc: Eric Dumazet <edumazet@google.com> Cc: Dave That <dave.taht@gmail.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp_ipv4.c5
-rw-r--r--net/ipv4/tcp_output.c13
3 files changed, 24 insertions, 1 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index c3852a7ff3c7..841de32f1fee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -821,6 +821,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_ecn_fallback",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "ip_local_port_range",
.maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
.data = &init_net.ipv4.ip_local_ports.range,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 91cb4768a860..0cc4b5a630cd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2411,12 +2411,15 @@ static int __net_init tcp_sk_init(struct net *net)
goto fail;
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
+
net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_ecn_fallback = 1;
+
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
- return 0;
+ return 0;
fail:
tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 08c2cc40b26d..e29d43b5a0bb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -350,6 +350,15 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
}
}
+static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+ if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+ /* tp->ecn_flags are cleared at a later point in time when
+ * SYN ACK is ultimatively being received.
+ */
+ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
+}
+
static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
struct sock *sk)
@@ -2615,6 +2624,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
}
}
+ /* RFC3168, section 6.1.1.1. ECN fallback */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+
tcp_retrans_try_collapse(sk, skb, cur_mss);
/* Make a copy, if the first transmission SKB clone we made