diff options
author | Eric Dumazet <edumazet@google.com> | 2015-10-02 11:43:32 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-10-03 04:32:41 -0700 |
commit | 079096f103faca2dd87342cca6f23d4b34da8871 (patch) | |
tree | fa3fb0fdc064f1611c464384e70a7a402179808f /net/ipv4/inet_diag.c | |
parent | 2feda34192a379f8b35a7c6c5826b2f23e884f32 (diff) |
tcp/dccp: install syn_recv requests into ehash table
In this patch, we insert request sockets into TCP/DCCP
regular ehash table (where ESTABLISHED and TIMEWAIT sockets
are) instead of using the per listener hash table.
ACK packets find SYN_RECV pseudo sockets without having
to find and lock the listener.
In nominal conditions, this halves pressure on listener lock.
Note that this will allow for SO_REUSEPORT refinements,
so that we can select a listener using cpu/numa affinities instead
of the prior 'consistent hash', since only SYN packets will
apply this selection logic.
We will shrink listen_sock in the following patch to ease
code review.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ying Cai <ycai@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inet_diag.c')
-rw-r--r-- | net/ipv4/inet_diag.c | 96 |
1 files changed, 8 insertions, 88 deletions
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0ac1d68dc8a6..ab9f8a66615d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -730,91 +730,21 @@ static void twsk_build_assert(void) #endif } -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_entry entry; - int j, s_j, reqnum, s_reqnum; - struct listen_sock *lopt; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue)) - goto out; - - if (bc) { - entry.sport = inet->inet_num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.idiag_dport != ireq->ir_rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - /* Note: entry.sport and entry.userlocks are already set */ - entry_fill_addrs(&entry, req_to_sk(req)); - entry.dport = ntohs(ireq->ir_rmt_port); - - if (!inet_diag_bc_run(bc, &entry)) - continue; - } - - err = inet_req_diag_fill(req_to_sk(req), skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { struct net *net = sock_net(skb->sk); int i, num, s_i, s_num; + u32 idiag_states = r->idiag_states; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & TCPF_LISTEN)) goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { @@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || + if (r->id.idiag_dport || cb->args[3] > 0) - goto syn_recv; - - if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { - spin_unlock_bh(&ilb->lock); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { + if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { spin_unlock_bh(&ilb->lock); goto done; } @@ -879,7 +799,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { @@ -906,7 +826,7 @@ skip_listen_ht: goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_substate : sk->sk_state; - if (!(r->idiag_states & (1 << state))) + if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) |