diff options
author | Eric Dumazet <edumazet@google.com> | 2018-04-16 10:33:38 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-04-16 18:26:37 -0400 |
commit | 93ab6cc69162775201587cc9da00d5016dc890e2 (patch) | |
tree | dad02afad888037a78dc48b9aeb66e0954897dd8 /net/ipv4/tcp.c | |
parent | 03f45c883c6f391ed4fff8292415b35bd1107519 (diff) |
tcp: implement mmap() for zero copy receive
Some networks can make sure TCP payload can exactly fit 4KB pages,
with well chosen MSS/MTU and architectures.
Implement mmap() system call so that applications can avoid
copying data without complex splice() games.
Note that a successful mmap( X bytes) on TCP socket is consuming
bytes, as if recvmsg() has been done. (tp->copied += X)
Only PROT_READ mappings are accepted, as skb page frags
are fundamentally shared and read only.
If tcp_mmap() finds data that is not a full page, or a patch of
urgent data, -EINVAL is returned, no bytes are consumed.
Application must fallback to recvmsg() to read the problematic sequence.
mmap() wont block, regardless of socket being in blocking or
non-blocking mode. If not enough bytes are in receive queue,
mmap() would return -EAGAIN, or -EIO if socket is in a state
where no other bytes can be added into receive queue.
An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD)
to efficiently use mmap()
On the sender side, MSG_EOR might help to clearly separate unaligned
headers and 4K-aligned chunks if necessary.
Tested:
mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch.
MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header)
Without mmap() (tcp_mmap -s)
received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit,
cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit,
cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit,
cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit,
cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches
With mmap() on receiver (tcp_mmap -s -z)
received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit,
cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit,
cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit,
cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit,
cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c768d306b657..438fbca96cd3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_set_rcvlowat); +/* When user wants to mmap X pages, we first need to perform the mapping + * before freeing any skbs in receive queue, otherwise user would be unable + * to fallback to standard recvmsg(). This happens if some data in the + * requested block is not exactly fitting in a page. + * + * We only support order-0 pages for the moment. + * mmap() on TCP is very strict, there is no point + * trying to accommodate with pathological layouts. + */ +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + unsigned int nr_pages = size >> PAGE_SHIFT; + struct page **pages_array = NULL; + u32 seq, len, offset, nr = 0; + struct sock *sk = sock->sk; + const skb_frag_t *frags; + struct tcp_sock *tp; + struct sk_buff *skb; + int ret; + + if (vma->vm_pgoff || !nr_pages) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + /* TODO: Maybe the following is not needed if pages are COW */ + vma->vm_flags &= ~VM_MAYWRITE; + + lock_sock(sk); + + ret = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + sock_rps_record_flow(sk); + + if (tcp_inq(sk) < size) { + ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN; + goto out; + } + tp = tcp_sk(sk); + seq = tp->copied_seq; + /* Abort if urgent data is in the area */ + if (unlikely(tp->urg_data)) { + u32 urg_offset = tp->urg_seq - seq; + + ret = -EINVAL; + if (urg_offset < size) + goto out; + } + ret = -ENOMEM; + pages_array = kvmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!pages_array) + goto out; + skb = tcp_recv_skb(sk, seq, &offset); + ret = -EINVAL; +skb_start: + /* We do not support anything not in page frags */ + offset -= skb_headlen(skb); + if ((int)offset < 0) + goto out; + if (skb_has_frag_list(skb)) + goto out; + len = skb->data_len - offset; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } + while (nr < nr_pages) { + if (len) { + if (len < PAGE_SIZE) + goto out; + if (frags->size != PAGE_SIZE || frags->page_offset) + goto out; + pages_array[nr++] = skb_frag_page(frags); + frags++; + len -= PAGE_SIZE; + seq += PAGE_SIZE; + continue; + } + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + goto skb_start; + } + /* OK, we have a full set of pages ready to be inserted into vma */ + for (nr = 0; nr < nr_pages; nr++) { + ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT), + pages_array[nr]); + if (ret) + goto out; + } + /* operation is complete, we can 'consume' all skbs */ + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, size); + + ret = 0; +out: + release_sock(sk); + kvfree(pages_array); + return ret; +} +EXPORT_SYMBOL(tcp_mmap); + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { |