diff options
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 411 |
1 files changed, 324 insertions, 87 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a692626c19e4..2e62e0d6373a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2277,51 +2277,72 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -/* - * Get next listener socket follow cur. If cur is NULL, get first socket - * starting from bucket given in st->bucket; when st->bucket is zero the - * very first socket in the hash table is returned. +static unsigned short seq_file_family(const struct seq_file *seq); + +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) +{ + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + +/* Find a non empty bucket (starting from st->bucket) + * and return the first sk from it. */ -static void *listening_get_next(struct seq_file *seq, void *cur) +static void *listening_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - struct sock *sk = cur; - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; + struct sock *sk; - if (!sk) { -get_head: - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock(&ilb->lock); - sk = sk_nulls_head(&ilb->nulls_head); - st->offset = 0; - goto get_sk; + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + if (hlist_empty(&ilb2->head)) + continue; + + spin_lock(&ilb2->lock); + inet_lhash2_for_each_icsk(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) + return sk; + } + spin_unlock(&ilb2->lock); } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; + + return NULL; +} + +/* Find the next sk of "cur" within the same bucket (i.e. st->bucket). + * If "cur" is the last one in the st->bucket, + * call listening_get_first() to return the first sk of the next + * non empty bucket. + */ +static void *listening_get_next(struct seq_file *seq, void *cur) +{ + struct tcp_iter_state *st = seq->private; + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; + struct sock *sk = cur; + ++st->num; ++st->offset; - sk = sk_nulls_next(sk); -get_sk: - sk_nulls_for_each_from(sk, node) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + icsk = inet_csk(sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) return sk; } - spin_unlock(&ilb->lock); - st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) - goto get_head; - return NULL; + + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + spin_unlock(&ilb2->lock); + ++st->bucket; + return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2331,7 +2352,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) st->bucket = 0; st->offset = 0; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2351,15 +2372,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - void *rc = NULL; - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { @@ -2373,32 +2386,20 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if ((afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family) || - !net_eq(sock_net(sk), net)) { - continue; - } - rc = sk; - goto out; + if (seq_sk_match(seq, sk)) + return sk; } spin_unlock_bh(lock); } -out: - return rc; + + return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); ++st->num; ++st->offset; @@ -2406,9 +2407,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if ((afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) && - net_eq(sock_net(sk), net)) + if (seq_sk_match(seq, sk)) return sk; } @@ -2451,17 +2450,18 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: - if (st->bucket >= INET_LHTABLE_SIZE) + if (st->bucket > tcp_hashinfo.lhash2_mask) break; st->state = TCP_SEQ_STATE_LISTENING; - rc = listening_get_next(seq, NULL); - while (offset-- && rc) + rc = listening_get_first(seq); + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2472,7 +2472,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } @@ -2542,7 +2542,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) @@ -2687,6 +2687,15 @@ out: } #ifdef CONFIG_BPF_SYSCALL +struct bpf_tcp_iter_state { + struct tcp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); @@ -2705,16 +2714,204 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_tcp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct inet_connection_sock *icsk; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + icsk = inet_csk(start_sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + + return expected; +} + +static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct hlist_nulls_node *node; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + + return expected; +} + +static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + unsigned int expected; + bool resized = false; + struct sock *sk; + + /* The st->bucket is done. Directly advance to the next + * bucket instead of having the tcp_seek_last_pos() to skip + * one by one in the current bucket and eventually find out + * it has to advance to the next bucket. + */ + if (iter->st_bucket_done) { + st->offset = 0; + st->bucket++; + if (st->state == TCP_SEQ_STATE_LISTENING && + st->bucket > tcp_hashinfo.lhash2_mask) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + } + } + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + + sk = tcp_seek_last_pos(seq); + if (!sk) + return NULL; /* Done */ + + if (st->state == TCP_SEQ_STATE_LISTENING) + expected = bpf_iter_tcp_listening_batch(seq, sk); + else + expected = bpf_iter_tcp_established_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + if (*pos) + return bpf_iter_tcp_batch(seq); + + return SEQ_START_TOKEN; +} + +static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) { + /* Keeping st->num consistent in tcp_iter_state. + * bpf_iter_tcp does not use st->num. + * meta.seq_num is used instead. + */ + st->num++; + /* Move st->offset to the next sk in the bucket such that + * the future start() will resume at st->offset in + * st->bucket. See tcp_seek_last_pos(). + */ + st->offset++; + sock_put(iter->batch[iter->cur_sk++]); + } + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_tcp_batch(seq); + + ++*pos; + /* Keeping st->last_pos consistent in tcp_iter_state. + * bpf iter does not do lseek, so st->last_pos always equals to *pos. + */ + st->last_pos = *pos; + return sk; +} + static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; + bool slow; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + if (sk_fullsock(sk)) + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { @@ -2728,11 +2925,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return tcp_prog_seq_show(prog, &meta, v, uid); + ret = tcp_prog_seq_show(prog, &meta, v, uid); + +unlock: + if (sk_fullsock(sk)) + unlock_sock_fast(sk, slow); + return ret; + } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -2743,16 +2947,33 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) (void)tcp_prog_seq_show(prog, &meta, v, 0); } - tcp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_tcp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, - .start = tcp_seq_start, - .next = tcp_seq_next, + .start = bpf_iter_tcp_seq_start, + .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct tcp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ + if (seq->op == &bpf_iter_tcp_seq_ops) + return AF_UNSPEC; +#endif + + /* Iterated from proc fs */ + afinfo = PDE_DATA(file_inode(seq->file)); + return afinfo->family; +} static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, @@ -2964,7 +3185,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; - spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); @@ -3003,39 +3223,55 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) +#define INIT_BATCH_SZ 16 + static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - struct tcp_iter_state *st = priv_data; - struct tcp_seq_afinfo *afinfo; - int ret; + struct bpf_tcp_iter_state *iter = priv_data; + int err; - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) - return -ENOMEM; + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; - afinfo->family = AF_UNSPEC; - st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data, aux); - if (ret) - kfree(afinfo); - return ret; + err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; } static void bpf_iter_fini_tcp(void *priv_data) { - struct tcp_iter_state *st = priv_data; + struct bpf_tcp_iter_state *iter = priv_data; - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, - .seq_priv_size = sizeof(struct tcp_iter_state), + .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, @@ -3043,6 +3279,7 @@ static struct bpf_iter_reg tcp_reg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; |