From d8e18a516f8f67404c0d21af8c93d0474fba0876 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 22 Jul 2019 20:08:26 -0700 Subject: net: Use skb accessors in network core In preparation for unifying the skb_frag and bio_vec, use the fine accessors which already exist and use skb_frag_t instead of struct skb_frag_struct. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 776905899ac0..f62f0e7e3cdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1776,19 +1776,21 @@ static int tcp_zerocopy_receive(struct sock *sk, break; frags = skb_shinfo(skb)->frags; while (offset) { - if (frags->size > offset) + if (skb_frag_size(frags) > offset) goto out; - offset -= frags->size; + offset -= skb_frag_size(frags); frags++; } } - if (frags->size != PAGE_SIZE || frags->page_offset) { + if (skb_frag_size(frags) != PAGE_SIZE || frags->page_offset) { int remaining = zc->recv_skip_hint; + int size = skb_frag_size(frags); - while (remaining && (frags->size != PAGE_SIZE || + while (remaining && (size != PAGE_SIZE || frags->page_offset)) { - remaining -= frags->size; + remaining -= size; frags++; + size = skb_frag_size(frags); } zc->recv_skip_hint -= remaining; break; @@ -3781,7 +3783,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, return 1; for (i = 0; i < shi->nr_frags; ++i) { - const struct skb_frag_struct *f = &shi->frags[i]; + const skb_frag_t *f = &shi->frags[i]; unsigned int offset = f->page_offset; struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); -- cgit v1.2.3 From b54c9d5bd6e38edac9ce3a3f95f14a1292b5268d Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Tue, 30 Jul 2019 07:40:33 -0700 Subject: net: Use skb_frag_off accessors Use accessor functions for skb fragment's page_offset instead of direct references, in preparation for bvec conversion. 
Signed-off-by: Jonathan Lemon Signed-off-by: David S. Miller --- drivers/atm/eni.c | 2 +- drivers/hsi/clients/ssi_protocol.c | 2 +- drivers/infiniband/hw/hfi1/vnic_sdma.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 3 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 2 +- drivers/net/ethernet/chelsio/cxgb3/sge.c | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 12 ++--- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +- drivers/net/ethernet/jme.c | 4 +- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 6 +-- drivers/net/ethernet/sfc/tx.c | 2 +- drivers/net/ethernet/sun/cassini.c | 8 ++-- drivers/net/ethernet/sun/niu.c | 2 +- drivers/net/ethernet/sun/sunvnet_common.c | 4 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 4 +- drivers/net/thunderbolt.c | 2 +- drivers/net/usb/usbnet.c | 2 +- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- drivers/net/xen-netback/netback.c | 6 +-- drivers/net/xen-netfront.c | 8 ++-- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 2 +- drivers/scsi/fcoe/fcoe.c | 3 +- drivers/scsi/fcoe/fcoe_transport.c | 2 +- drivers/scsi/qedf/qedf_main.c | 2 +- drivers/staging/unisys/visornic/visornic_main.c | 2 +- drivers/target/iscsi/cxgbit/cxgbit_target.c | 4 +- net/appletalk/ddp.c | 4 +- net/core/datagram.c | 6 +-- net/core/dev.c | 2 +- net/core/pktgen.c | 2 +- net/core/skbuff.c | 54 +++++++++++----------- net/ipv4/tcp.c | 6 +-- net/ipv4/tcp_output.c | 2 +- net/kcm/kcmsock.c | 2 +- net/tls/tls_device.c | 8 ++-- net/tls/tls_device_fallback.c | 2 +- net/xfrm/xfrm_ipcomp.c | 2 +- 44 files changed, 100 insertions(+), 98 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 
79b718430cd1..b23d1e4bad33 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -1136,7 +1136,7 @@ DPRINTK("doing direct send\n"); /* @@@ well, this doesn't work anyway */ else put_dma(tx->index,eni_dev->dma,&j,(unsigned long) skb_frag_page(&skb_shinfo(skb)->frags[i]) + - skb_shinfo(skb)->frags[i].page_offset, + skb_frag_off(&skb_shinfo(skb)->frags[i]), skb_frag_size(&skb_shinfo(skb)->frags[i])); } if (skb->len & 3) { diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c index c9e3f928b93d..0253e76f1df2 100644 --- a/drivers/hsi/clients/ssi_protocol.c +++ b/drivers/hsi/clients/ssi_protocol.c @@ -182,7 +182,7 @@ static void ssip_skb_to_msg(struct sk_buff *skb, struct hsi_msg *msg) BUG_ON(!sg); frag = &skb_shinfo(skb)->frags[i]; sg_set_page(sg, skb_frag_page(frag), skb_frag_size(frag), - frag->page_offset); + skb_frag_off(frag)); } } diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 05a140504a99..7d90b900131b 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -108,7 +108,7 @@ static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, ret = sdma_txadd_page(sde->dd, &tx->txreq, skb_frag_page(frag), - frag->page_offset, + skb_frag_off(frag), skb_frag_size(frag)); if (unlikely(ret)) goto bail_txadd; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 78fa777c87b1..c332b4761816 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -293,7 +293,8 @@ int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; mapping[i + off] = ib_dma_map_page(ca, skb_frag_page(frag), - frag->page_offset, skb_frag_size(frag), + skb_frag_off(frag), + skb_frag_size(frag), DMA_TO_DEVICE); if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) goto partial_error; diff --git 
a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ac61c9352535..c23fbb34f0e9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -957,7 +957,7 @@ static struct sk_buff *bnxt_rx_page_skb(struct bnxt *bp, frag = &skb_shinfo(skb)->frags[0]; skb_frag_size_sub(frag, payload); - frag->page_offset += payload; + skb_frag_off_add(frag, payload); skb->data_len -= payload; skb->tail += payload; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index c0266a87794c..4ab57d33a87e 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -1594,7 +1594,7 @@ int nicvf_sq_append_skb(struct nicvf *nic, struct snd_queue *sq, size = skb_frag_size(frag); dma_addr = dma_map_page_attrs(&nic->pdev->dev, skb_frag_page(frag), - frag->page_offset, size, + skb_frag_off(frag), size, DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); if (dma_mapping_error(&nic->pdev->dev, dma_addr)) { diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 310a232e00f0..6dabbf1502c7 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -2182,7 +2182,7 @@ static void lro_add_page(struct adapter *adap, struct sge_qset *qs, rx_frag += nr_frags; __skb_frag_set_page(rx_frag, sd->pg_chunk.page); - rx_frag->page_offset = sd->pg_chunk.offset + offset; + skb_frag_off_set(rx_frag, sd->pg_chunk.offset + offset); skb_frag_size_set(rx_frag, len); skb->len += len; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index e00a94a03879..1c9883019767 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2346,8 +2346,8 @@ static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb, memcpy(skb->data, 
start, hdr_len); skb_shinfo(skb)->nr_frags = 1; skb_frag_set_page(skb, 0, page_info->page); - skb_shinfo(skb)->frags[0].page_offset = - page_info->page_offset + hdr_len; + skb_frag_off_set(&skb_shinfo(skb)->frags[0], + page_info->page_offset + hdr_len); skb_frag_size_set(&skb_shinfo(skb)->frags[0], curr_frag_len - hdr_len); skb->data_len = curr_frag_len - hdr_len; @@ -2372,8 +2372,8 @@ static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb, /* Fresh page */ j++; skb_frag_set_page(skb, j, page_info->page); - skb_shinfo(skb)->frags[j].page_offset = - page_info->page_offset; + skb_frag_off_set(&skb_shinfo(skb)->frags[j], + page_info->page_offset); skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); skb_shinfo(skb)->nr_frags++; } else { @@ -2454,8 +2454,8 @@ static void be_rx_compl_process_gro(struct be_rx_obj *rxo, /* First frag or Fresh page */ j++; skb_frag_set_page(skb, j, page_info->page); - skb_shinfo(skb)->frags[j].page_offset = - page_info->page_offset; + skb_frag_off_set(&skb_shinfo(skb)->frags[j], + page_info->page_offset); skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0); } else { put_page(page_info->page); diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 5fad73b2e123..3981c06f082f 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -501,7 +501,7 @@ fs_enet_start_xmit(struct sk_buff *skb, struct net_device *dev) nr_frags = skb_shinfo(skb)->nr_frags; frag = skb_shinfo(skb)->frags; for (i = 0; i < nr_frags; i++, frag++) { - if (!IS_ALIGNED(frag->page_offset, 4)) { + if (!IS_ALIGNED(skb_frag_off(frag), 4)) { is_aligned = 0; break; } diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3da680073265..81a05ea38237 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1485,7 +1485,7 @@ static netdev_tx_t 
ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) memcpy(dst + cur, page_address(skb_frag_page(frag)) + - frag->page_offset, skb_frag_size(frag)); + skb_frag_off(frag), skb_frag_size(frag)); cur += skb_frag_size(frag); } } else { diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index f162252f01b5..e3f29dc8b290 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3306,7 +3306,7 @@ bool __i40e_chk_linearize(struct sk_buff *skb) * descriptor associated with the fragment. */ if (stale_size > I40E_MAX_DATA_PER_TXD) { - int align_pad = -(stale->page_offset) & + int align_pad = -(skb_frag_off(stale)) & (I40E_MAX_READ_REQ_SIZE - 1); sum -= align_pad; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index fae7cd1c618a..7a30d5d5ef53 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -2205,7 +2205,7 @@ bool __iavf_chk_linearize(struct sk_buff *skb) * descriptor associated with the fragment. 
*/ if (stale_size > IAVF_MAX_DATA_PER_TXD) { - int align_pad = -(stale->page_offset) & + int align_pad = -(skb_frag_off(stale)) & (IAVF_MAX_READ_REQ_SIZE - 1); sum -= align_pad; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index e12d23d1fa64..dc7b128c780e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1807,7 +1807,7 @@ static void ixgbe_pull_tail(struct ixgbe_ring *rx_ring, /* update all of the pointers */ skb_frag_size_sub(frag, pull_len); - frag->page_offset += pull_len; + skb_frag_off_add(frag, pull_len); skb->data_len -= pull_len; skb->tail += pull_len; } @@ -1844,7 +1844,7 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring, dma_sync_single_range_for_cpu(rx_ring->dev, IXGBE_CB(skb)->dma, - frag->page_offset, + skb_frag_off(frag), skb_frag_size(frag), DMA_FROM_DEVICE); } diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 9c3ab00643bd..6d52cf5ce20e 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -2040,8 +2040,8 @@ jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx) ctxbi = txbi + ((idx + i + 2) & (mask)); ret = jme_fill_tx_map(jme->pdev, ctxdesc, ctxbi, - skb_frag_page(frag), - frag->page_offset, skb_frag_size(frag), hidma); + skb_frag_page(frag), skb_frag_off(frag), + skb_frag_size(frag), hidma); if (ret) { jme_drop_tx_map(jme, idx, i); goto out; diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 88ea5ac83c93..82ea55ae5053 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -659,7 +659,7 @@ static inline unsigned int has_tiny_unaligned_frags(struct sk_buff *skb) for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) { const skb_frag_t *fragp = &skb_shinfo(skb)->frags[frag]; - if (skb_frag_size(fragp) <= 8 && fragp->page_offset & 7) + if 
(skb_frag_size(fragp) <= 8 && skb_frag_off(fragp) & 7) return 1; } diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 9ead6ecb7586..99eaadba555f 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -1306,8 +1306,8 @@ myri10ge_vlan_rx(struct net_device *dev, void *addr, struct sk_buff *skb) skb->len -= VLAN_HLEN; skb->data_len -= VLAN_HLEN; frag = skb_shinfo(skb)->frags; - frag->page_offset += VLAN_HLEN; - skb_frag_size_set(frag, skb_frag_size(frag) - VLAN_HLEN); + skb_frag_off_add(frag, VLAN_HLEN); + skb_frag_size_sub(frag, VLAN_HLEN); } } @@ -1364,7 +1364,7 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) } /* remove padding */ - rx_frags[0].page_offset += MXGEFW_PAD; + skb_frag_off_add(&rx_frags[0], MXGEFW_PAD); skb_frag_size_sub(&rx_frags[0], MXGEFW_PAD); len -= MXGEFW_PAD; diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 31ec56091a5d..65e81ec1b314 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -274,7 +274,7 @@ static void efx_skb_copy_bits_to_pio(struct efx_nic *efx, struct sk_buff *skb, vaddr = kmap_atomic(skb_frag_page(f)); - efx_memcpy_toio_aligned_cb(efx, piobuf, vaddr + f->page_offset, + efx_memcpy_toio_aligned_cb(efx, piobuf, vaddr + skb_frag_off(f), skb_frag_size(f), copy_buf); kunmap_atomic(vaddr); } diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index 6fc05c106afc..c91876f8c536 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -2034,7 +2034,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc, __skb_frag_set_page(frag, page->buffer); __skb_frag_ref(frag); - frag->page_offset = off; + skb_frag_off_set(frag, off); skb_frag_size_set(frag, hlen - swivel); /* any more data? 
*/ @@ -2058,7 +2058,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc, __skb_frag_set_page(frag, page->buffer); __skb_frag_ref(frag); - frag->page_offset = 0; + skb_frag_off_set(frag, 0); skb_frag_size_set(frag, hlen); RX_USED_ADD(page, hlen + cp->crc_size); } @@ -2816,7 +2816,7 @@ static inline int cas_xmit_tx_ringN(struct cas *cp, int ring, mapping = skb_frag_dma_map(&cp->pdev->dev, fragp, 0, len, DMA_TO_DEVICE); - tabort = cas_calc_tabort(cp, fragp->page_offset, len); + tabort = cas_calc_tabort(cp, skb_frag_off(fragp), len); if (unlikely(tabort)) { void *addr; @@ -2827,7 +2827,7 @@ static inline int cas_xmit_tx_ringN(struct cas *cp, int ring, addr = cas_page_map(skb_frag_page(fragp)); memcpy(tx_tiny_buf(cp, ring, entry), - addr + fragp->page_offset + len - tabort, + addr + skb_frag_off(fragp) + len - tabort, tabort); cas_page_unmap(addr); mapping = tx_tiny_map(cp, ring, entry, tentry); diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 0bc5863bffeb..f5fd1f3c07cc 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6695,7 +6695,7 @@ static netdev_tx_t niu_start_xmit(struct sk_buff *skb, len = skb_frag_size(frag); mapping = np->ops->map_page(np->device, skb_frag_page(frag), - frag->page_offset, len, + skb_frag_off(frag), len, DMA_TO_DEVICE); rp->tx_buffs[prod].skb = NULL; diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c index baa3088b475c..646e67236b65 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.c +++ b/drivers/net/ethernet/sun/sunvnet_common.c @@ -1088,7 +1088,7 @@ static inline int vnet_skb_map(struct ldc_channel *lp, struct sk_buff *skb, vaddr = kmap_atomic(skb_frag_page(f)); blen = skb_frag_size(f); blen += 8 - (blen & 7); - err = ldc_map_single(lp, vaddr + f->page_offset, + err = ldc_map_single(lp, vaddr + skb_frag_off(f), blen, cookies + nc, ncookies - nc, map_perm); kunmap_atomic(vaddr); @@ -1124,7 +1124,7 @@ 
static inline struct sk_buff *vnet_skb_shape(struct sk_buff *skb, int ncookies) for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - docopy |= f->page_offset & 7; + docopy |= skb_frag_off(f) & 7; } if (((unsigned long)skb->data & 7) != VNET_PACKET_SKIP || skb_tailroom(skb) < pad || diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 642843945031..1b2702f74455 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1116,7 +1116,7 @@ netcp_tx_map_skb(struct sk_buff *skb, struct netcp_intf *netcp) for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); - u32 page_offset = frag->page_offset; + u32 page_offset = skb_frag_off(frag); u32 buf_len = skb_frag_size(frag); dma_addr_t desc_dma; u32 desc_dma_32; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 3544e1991579..86884c863013 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -435,7 +435,7 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, skb_frag_t *frag = skb_shinfo(skb)->frags + i; slots_used += fill_pg_buf(skb_frag_page(frag), - frag->page_offset, + skb_frag_off(frag), skb_frag_size(frag), &pb[slots_used]); } return slots_used; @@ -449,7 +449,7 @@ static int count_skb_frag_slots(struct sk_buff *skb) for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; unsigned long size = skb_frag_size(frag); - unsigned long offset = frag->page_offset; + unsigned long offset = skb_frag_off(frag); /* Skip unused frames from start of page */ offset &= ~PAGE_MASK; diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c index fcf31335a8b6..dacb4f680fd4 100644 --- a/drivers/net/thunderbolt.c +++ b/drivers/net/thunderbolt.c @@ -1005,7 +1005,7 @@ static void *tbnet_kmap_frag(struct sk_buff *skb, unsigned int 
frag_num, const skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_num]; *len = skb_frag_size(frag); - return kmap_atomic(skb_frag_page(frag)) + frag->page_offset; + return kmap_atomic(skb_frag_page(frag)) + skb_frag_off(frag); } static netdev_tx_t tbnet_start_xmit(struct sk_buff *skb, diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index ace7ffaf3913..58952a79b05f 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1328,7 +1328,7 @@ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb) total_len += skb_frag_size(f); sg_set_page(&urb->sg[i + s], skb_frag_page(f), skb_frag_size(f), - f->page_offset); + skb_frag_off(f)); } urb->transfer_buffer_length = total_len; diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 03feaeae89cd..216acf37ca7c 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -662,7 +662,7 @@ vmxnet3_append_frag(struct sk_buff *skb, struct Vmxnet3_RxCompDesc *rcd, BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS); __skb_frag_set_page(frag, rbi->page); - frag->page_offset = 0; + skb_frag_off_set(frag, 0); skb_frag_size_set(frag, rcd->len); skb->data_len += rcd->len; skb->truesize += PAGE_SIZE; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index a96c5c2a2c5a..3ef07b63613e 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -136,12 +136,12 @@ static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info *ubuf) static u16 frag_get_pending_idx(skb_frag_t *frag) { - return (u16)frag->page_offset; + return (u16)skb_frag_off(frag); } static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx) { - frag->page_offset = pending_idx; + skb_frag_off_set(frag, pending_idx); } static inline pending_ring_idx_t pending_index(unsigned i) @@ -1068,7 +1068,7 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s offset += len; 
__skb_frag_set_page(&frags[i], page); - frags[i].page_offset = 0; + skb_frag_off_set(&frags[i], 0); skb_frag_size_set(&frags[i], len); } diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 8d33970a2950..b930d5f95222 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -531,7 +531,7 @@ static int xennet_count_skb_slots(struct sk_buff *skb) for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; unsigned long size = skb_frag_size(frag); - unsigned long offset = frag->page_offset; + unsigned long offset = skb_frag_off(frag); /* Skip unused frames from start of page */ offset &= ~PAGE_MASK; @@ -674,8 +674,8 @@ static netdev_tx_t xennet_start_xmit(struct sk_buff *skb, struct net_device *dev /* Requests for all the frags. */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - tx = xennet_make_txreqs(queue, tx, skb, - skb_frag_page(frag), frag->page_offset, + tx = xennet_make_txreqs(queue, tx, skb, skb_frag_page(frag), + skb_frag_off(frag), skb_frag_size(frag)); } @@ -1040,7 +1040,7 @@ err: if (NETFRONT_SKB_CB(skb)->pull_to > RX_COPY_THRESHOLD) NETFRONT_SKB_CB(skb)->pull_to = RX_COPY_THRESHOLD; - skb_shinfo(skb)->frags[0].page_offset = rx->offset; + skb_frag_off_set(&skb_shinfo(skb)->frags[0], rx->offset); skb_frag_size_set(&skb_shinfo(skb)->frags[0], rx->status); skb->data_len = rx->status; skb->len += rx->status; diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 7796799bf04a..9ff9429395eb 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -346,7 +346,7 @@ static int bnx2fc_xmit(struct fc_lport *lport, struct fc_frame *fp) return -ENOMEM; } frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags - 1]; - cp = kmap_atomic(skb_frag_page(frag)) + frag->page_offset; + cp = kmap_atomic(skb_frag_page(frag)) + skb_frag_off(frag); } else { cp = skb_put(skb, tlen); } diff --git 
a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 00dd47bcbb1e..587d4bbb7d22 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1522,8 +1522,7 @@ static int fcoe_xmit(struct fc_lport *lport, struct fc_frame *fp) return -ENOMEM; } frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags - 1]; - cp = kmap_atomic(skb_frag_page(frag)) - + frag->page_offset; + cp = kmap_atomic(skb_frag_page(frag)) + skb_frag_off(frag); } else { cp = skb_put(skb, tlen); } diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c index d0550384cc38..a20ddc301c89 100644 --- a/drivers/scsi/fcoe/fcoe_transport.c +++ b/drivers/scsi/fcoe/fcoe_transport.c @@ -318,7 +318,7 @@ u32 fcoe_fc_crc(struct fc_frame *fp) for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { frag = &skb_shinfo(skb)->frags[i]; - off = frag->page_offset; + off = skb_frag_off(frag); len = skb_frag_size(frag); while (len > 0) { clen = min(len, PAGE_SIZE - (off & ~PAGE_MASK)); diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index a42babde036d..42542720962f 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -1077,7 +1077,7 @@ static int qedf_xmit(struct fc_lport *lport, struct fc_frame *fp) return -ENOMEM; } frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags - 1]; - cp = kmap_atomic(skb_frag_page(frag)) + frag->page_offset; + cp = kmap_atomic(skb_frag_page(frag)) + skb_frag_off(frag); } else { cp = skb_put(skb, tlen); } diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c index b889b04a6e25..6fa7726185de 100644 --- a/drivers/staging/unisys/visornic/visornic_main.c +++ b/drivers/staging/unisys/visornic/visornic_main.c @@ -284,7 +284,7 @@ static int visor_copy_fragsinfo_from_skb(struct sk_buff *skb, for (frag = 0; frag < numfrags; frag++) { count = add_physinfo_entries(page_to_pfn( skb_frag_page(&skb_shinfo(skb)->frags[frag])), - 
skb_shinfo(skb)->frags[frag].page_offset, + skb_frag_off(&skb_shinfo(skb)->frags[frag]), skb_frag_size(&skb_shinfo(skb)->frags[frag]), count, frags_max, frags); /* add_physinfo_entries only returns diff --git a/drivers/target/iscsi/cxgbit/cxgbit_target.c b/drivers/target/iscsi/cxgbit/cxgbit_target.c index c25315431ad0..fcdc4211e3c2 100644 --- a/drivers/target/iscsi/cxgbit/cxgbit_target.c +++ b/drivers/target/iscsi/cxgbit/cxgbit_target.c @@ -900,7 +900,7 @@ cxgbit_handle_immediate_data(struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr, sg_init_table(&ccmd->sg, 1); sg_set_page(&ccmd->sg, skb_frag_page(dfrag), - skb_frag_size(dfrag), dfrag->page_offset); + skb_frag_size(dfrag), skb_frag_off(dfrag)); get_page(skb_frag_page(dfrag)); cmd->se_cmd.t_data_sg = &ccmd->sg; @@ -1403,7 +1403,7 @@ static void cxgbit_lro_skb_dump(struct sk_buff *skb) pdu_cb->ddigest, pdu_cb->frags); for (i = 0; i < ssi->nr_frags; i++) pr_info("skb 0x%p, frag %d, off %u, sz %u.\n", - skb, i, ssi->frags[i].page_offset, + skb, i, skb_frag_off(&ssi->frags[i]), skb_frag_size(&ssi->frags[i])); } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index a8cb6b2e20c1..4072e9d394d6 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -953,8 +953,8 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap_atomic(skb_frag_page(frag)); - sum = atalk_sum_partial(vaddr + frag->page_offset + - offset - start, copy, sum); + sum = atalk_sum_partial(vaddr + skb_frag_off(frag) + + offset - start, copy, sum); kunmap_atomic(vaddr); if (!(len -= copy)) diff --git a/net/core/datagram.c b/net/core/datagram.c index 45a162ef5e02..4cc8dc5db2b7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -442,8 +442,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > len) copy = len; - n = cb(vaddr + frag->page_offset + - offset - start, copy, data, to); + n = cb(vaddr + skb_frag_off(frag) + offset - start, + copy, 
data, to); kunmap(page); offset += n; if (n != copy) @@ -573,7 +573,7 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, if (copy > len) copy = len; copied = copy_page_from_iter(skb_frag_page(frag), - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, from); if (copied != copy) goto fault; diff --git a/net/core/dev.c b/net/core/dev.c index fc676b2610e3..e2a11c62197b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5481,7 +5481,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) skb->data_len -= grow; skb->tail += grow; - pinfo->frags[0].page_offset += grow; + skb_frag_off_add(&pinfo->frags[0], grow); skb_frag_size_sub(&pinfo->frags[0], grow); if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bb9915291644..c5dbdc87342a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2652,7 +2652,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } get_page(pkt_dev->page); skb_frag_set_page(skb, i, pkt_dev->page); - skb_shinfo(skb)->frags[i].page_offset = 0; + skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0); /*last fragment, fill rest of data*/ if (i == (frags - 1)) skb_frag_size_set(&skb_shinfo(skb)->frags[i], diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0b788df5a75b..ea8e8d332d85 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -785,7 +785,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) struct page *p; u8 *vaddr; - skb_frag_foreach_page(frag, frag->page_offset, + skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { seg_len = min_t(int, p_len, len); @@ -1375,7 +1375,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) struct page *p; u8 *vaddr; - skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f), + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { u32 copy, done = 0; 
vaddr = kmap_atomic(p); @@ -2144,10 +2144,12 @@ pull_pages: skb_frag_unref(skb, i); eat -= size; } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; + + *frag = skb_shinfo(skb)->frags[i]; if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + skb_frag_off_add(frag, eat); + skb_frag_size_sub(frag, eat); if (!i) goto end; eat = 0; @@ -2219,7 +2221,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) copy = len; skb_frag_foreach_page(f, - f->page_offset + offset - start, + skb_frag_off(f) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(to + copied, vaddr + p_off, p_len); @@ -2395,7 +2397,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(skb_frag_page(f), - f->page_offset, skb_frag_size(f), + skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk, pipe)) return true; } @@ -2498,7 +2500,7 @@ do_frag_list: while (slen) { ret = kernel_sendpage_locked(sk, skb_frag_page(frag), - frag->page_offset + offset, + skb_frag_off(frag) + offset, slen, MSG_DONTWAIT); if (ret <= 0) goto error; @@ -2580,7 +2582,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) copy = len; skb_frag_foreach_page(frag, - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); memcpy(vaddr + p_off, from + copied, p_len); @@ -2660,7 +2662,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, copy = len; skb_frag_foreach_page(frag, - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = INDIRECT_CALL_1(ops->update, @@ -2759,7 +2761,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int 
offset, copy = len; skb_frag_foreach_page(frag, - frag->page_offset + offset - start, + skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); csum2 = csum_partial_copy_nocheck(vaddr + p_off, @@ -3234,7 +3236,7 @@ static inline void skb_split_no_header(struct sk_buff *skb, * 2. Split is accurately. We make this. */ skb_frag_ref(skb, i); - skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; @@ -3316,7 +3318,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) */ if (!to || !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), - fragfrom->page_offset)) { + skb_frag_off(fragfrom))) { merge = -1; } else { merge = to - 1; @@ -3333,7 +3335,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) skb_frag_size_add(fragto, shiftlen); skb_frag_size_sub(fragfrom, shiftlen); - fragfrom->page_offset += shiftlen; + skb_frag_off_add(fragfrom, shiftlen); goto onlymerged; } @@ -3364,11 +3366,11 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) } else { __skb_frag_ref(fragfrom); - fragto->bv_page = fragfrom->bv_page; - fragto->page_offset = fragfrom->page_offset; + skb_frag_page_copy(fragto, fragfrom); + skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); - fragfrom->page_offset += todo; + skb_frag_off_add(fragfrom, todo); skb_frag_size_sub(fragfrom, todo); todo = 0; @@ -3493,7 +3495,7 @@ next_skb: if (!st->frag_data) st->frag_data = kmap_atomic(skb_frag_page(frag)); - *data = (u8 *) st->frag_data + frag->page_offset + + *data = (u8 *) st->frag_data + skb_frag_off(frag) + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3630,8 +3632,8 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) page = 
virt_to_head_page(frag_skb->head); __skb_frag_set_page(&head_frag, page); - head_frag.page_offset = frag_skb->data - - (unsigned char *)page_address(page); + skb_frag_off_set(&head_frag, frag_skb->data - + (unsigned char *)page_address(page)); skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); return head_frag; } @@ -3875,7 +3877,7 @@ normal: size = skb_frag_size(nskb_frag); if (pos < offset) { - nskb_frag->page_offset += offset - pos; + skb_frag_off_add(nskb_frag, offset - pos); skb_frag_size_sub(nskb_frag, offset - pos); } @@ -3996,7 +3998,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) *--frag = *--frag2; } while (--i); - frag->page_offset += offset; + skb_frag_off_add(frag, offset); skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ @@ -4026,7 +4028,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; __skb_frag_set_page(frag, page); - frag->page_offset = first_offset; + skb_frag_off_set(frag, first_offset); skb_frag_size_set(frag, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); @@ -4042,7 +4044,7 @@ merge: if (offset > headlen) { unsigned int eat = offset - headlen; - skbinfo->frags[0].page_offset += eat; + skb_frag_off_add(&skbinfo->frags[0], eat); skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; @@ -4167,7 +4169,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, if (copy > len) copy = len; sg_set_page(&sg[elt], skb_frag_page(frag), copy, - frag->page_offset+offset-start); + skb_frag_off(frag) + offset - start); elt++; if (!(len -= copy)) return elt; @@ -5838,7 +5840,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, * where splitting is expensive. * 2. Split is accurately. We make this. 
*/ - shinfo->frags[0].page_offset += off - pos; + skb_frag_off_add(&shinfo->frags[0], off - pos); skb_frag_size_sub(&shinfo->frags[0], off - pos); } skb_frag_ref(skb, i); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f62f0e7e3cdd..a0a66321c0ee 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1782,12 +1782,12 @@ static int tcp_zerocopy_receive(struct sock *sk, frags++; } } - if (skb_frag_size(frags) != PAGE_SIZE || frags->page_offset) { + if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) { int remaining = zc->recv_skip_hint; int size = skb_frag_size(frags); while (remaining && (size != PAGE_SIZE || - frags->page_offset)) { + skb_frag_off(frags))) { remaining -= size; frags++; size = skb_frag_size(frags); @@ -3784,7 +3784,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; - unsigned int offset = f->page_offset; + unsigned int offset = skb_frag_off(f); struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6e4afc48d7bb..e6d02e05bb1c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1402,7 +1402,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { - shinfo->frags[k].page_offset += eat; + skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 05f63c4300e9..4ff75c3a8d6e 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -642,7 +642,7 @@ do_frag: ret = kernel_sendpage(psock->sk->sk_socket, skb_frag_page(frag), - frag->page_offset + frag_offset, + skb_frag_off(frag) + frag_offset, skb_frag_size(frag) - frag_offset, MSG_DONTWAIT); if (ret <= 0) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 4ec8a06fa5d1..d184230665eb 100644 --- a/net/tls/tls_device.c +++ 
b/net/tls/tls_device.c @@ -244,12 +244,12 @@ static void tls_append_frag(struct tls_record_info *record, frag = &record->frags[record->num_frags - 1]; if (skb_frag_page(frag) == pfrag->page && - frag->page_offset + skb_frag_size(frag) == pfrag->offset) { + skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) { skb_frag_size_add(frag, size); } else { ++frag; __skb_frag_set_page(frag, pfrag->page); - frag->page_offset = pfrag->offset; + skb_frag_off_set(frag, pfrag->offset); skb_frag_size_set(frag, size); ++record->num_frags; get_page(pfrag->page); @@ -301,7 +301,7 @@ static int tls_push_record(struct sock *sk, frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), - skb_frag_size(frag), frag->page_offset); + skb_frag_size(frag), skb_frag_off(frag)); sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } @@ -324,7 +324,7 @@ static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, frag = &record->frags[0]; __skb_frag_set_page(frag, pfrag->page); - frag->page_offset = pfrag->offset; + skb_frag_off_set(frag, pfrag->offset); skb_frag_size_set(frag, prepend_size); get_page(pfrag->page); diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 9070d68a92a4..28895333701e 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -273,7 +273,7 @@ static int fill_sg_in(struct scatterlist *sg_in, __skb_frag_ref(frag); sg_set_page(sg_in + i, skb_frag_page(frag), - skb_frag_size(frag), frag->page_offset); + skb_frag_size(frag), skb_frag_off(frag)); remaining -= skb_frag_size(frag); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 32c364d3bfb3..4d422447aadc 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -85,7 +85,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) if (dlen < len) len = dlen; - frag->page_offset = 0; + skb_frag_off_set(frag, 0); 
skb_frag_size_set(frag, len); memcpy(skb_frag_address(frag), scratch, len); -- cgit v1.2.3 From 965112785e4bd4355262c6c5a32ea8f349adb401 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:13 -0700 Subject: tcp: tcp_syn_flood_action read port from socket This allows us to call this function before an SKB has been allocated. Signed-off-by: Petar Penkov Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- net/ipv4/tcp_input.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c21e8a22fb3b..8892df6de1d4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6422,9 +6422,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, - const struct sk_buff *skb, - const char *proto) +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; @@ -6444,7 +6442,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. 
Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); + proto, sk->sk_num, msg); return want_cookie; } @@ -6487,7 +6485,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, */ if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); if (!want_cookie) goto drop; } -- cgit v1.2.3 From 9349d600fb6a1ca0aaeb515523e1bb5409483d76 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Mon, 29 Jul 2019 09:59:14 -0700 Subject: tcp: add skb-less helpers to retrieve SYN cookie This patch allows generation of a SYN cookie before an SKB has been allocated, as is the case at XDP. Signed-off-by: Petar Penkov Reviewed-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 10 +++++++ net/ipv4/tcp_input.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++++ net/ipv6/tcp_ipv6.c | 15 +++++++++++ 4 files changed, 113 insertions(+) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index e5cf514ba118..fb7e153aecc5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -414,6 +414,16 @@ void tcp_parse_options(const struct net *net, const struct sk_buff *skb, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); +/* + * BPF SKB-less helpers + */ +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie); +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8892df6de1d4..706cbb3b2986 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c 
@@ -3782,6 +3782,49 @@ static void smc_parse_options(const struct tcphdr *th, #endif } +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped + * value on success. + */ +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) +{ + const unsigned char *ptr = (const unsigned char *)(th + 1); + int length = (th->doff * 4) - sizeof(struct tcphdr); + u16 mss = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return mss; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + if (length < 2) + return mss; + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return mss; + if (opsize > length) + return mss; /* fail on partial options */ + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { + u16 in_mss = get_unaligned_be16(ptr); + + if (in_mss) { + if (user_mss && user_mss < in_mss) + in_mss = user_mss; + mss = in_mss; + } + } + ptr += opsize - 2; + length -= opsize; + } + } + return mss; +} + /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. @@ -6464,6 +6507,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk, } } +/* If a SYN cookie is required and supported, returns a clamped MSS value to be + * used for SYN cookie generation. 
+ */ +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 mss; + + if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 && + !inet_csk_reqsk_queue_is_full(sk)) + return 0; + + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) + return 0; + + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + return 0; + } + + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + if (!mss) + mss = af_ops->mss_clamp; + + return mss; +} +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d57641cb3477..10217393cda6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1515,6 +1515,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, th); + if (mss) { + *cookie = __cookie_v4_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. 
* diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5da069e91cac..87f44d3250ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1063,6 +1063,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) return sk; } +u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, + struct tcphdr *th, u32 *cookie) +{ + u16 mss = 0; +#ifdef CONFIG_SYN_COOKIES + mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, th); + if (mss) { + *cookie = __cookie_v6_init_sequence(iph, th, &mss); + tcp_synq_overflow(sk); + } +#endif + return mss; +} + static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) -- cgit v1.2.3 From 8c0bb7873815bf8c3c4dfb24e8ebf4fefb4c35d2 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 10 Jul 2019 12:05:59 +0200 Subject: netfilter: synproxy: rename mss synproxy_options field After introducing the "mss_encode" field in the synproxy_options struct, the field name "mss" is a little confusing. It has been renamed to "mss_option".
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 2 +- net/ipv4/netfilter/ipt_SYNPROXY.c | 4 ++-- net/ipv6/netfilter/ip6t_SYNPROXY.c | 4 ++-- net/netfilter/nf_synproxy_core.c | 8 ++++---- net/netfilter/nft_synproxy.c | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 44513b93bd55..2f0171d24997 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -67,7 +67,7 @@ static inline struct synproxy_net *synproxy_pernet(struct net *net) struct synproxy_options { u8 options; u8 wscale; - u16 mss; + u16 mss_option; u16 mss_encode; u32 tsval; u32 tsecr; diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 0e70f3f65f6f..748dc3ce58d3 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -36,8 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; - opts.mss_encode = opts.mss; - opts.mss = info->mss; + opts.mss_encode = opts.mss_option; + opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 5cdb4a69d277..fd1f52a21bf1 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -36,8 +36,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; - opts.mss_encode = opts.mss; - opts.mss = info->mss; + opts.mss_encode = opts.mss_option; + opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else diff --git 
a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c769462a839e..b0930d4aba22 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -56,7 +56,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, switch (opcode) { case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { - opts->mss = get_unaligned_be16(ptr); + opts->mss_option = get_unaligned_be16(ptr); opts->options |= NF_SYNPROXY_OPT_MSS; } break; @@ -115,7 +115,7 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | - opts->mss); + opts->mss_option); if (options & NF_SYNPROXY_OPT_TIMESTAMP) { if (options & NF_SYNPROXY_OPT_SACK_PERM) @@ -642,7 +642,7 @@ synproxy_recv_client_ack(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) @@ -1060,7 +1060,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, } this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; + opts->mss_option = mss; opts->options |= NF_SYNPROXY_OPT_MSS; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 928e661d1517..db4c23f5dfcb 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -31,8 +31,8 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts, opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; - opts->mss_encode = opts->mss; - opts->mss = info->mss; + opts->mss_encode = opts->mss_option; + opts->mss_option = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else -- cgit v1.2.3 From 1a9914884db5138682032cf69f2d55739f236c80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Aug 2019 05:04:47 -0700 Subject: tcp: batch calls to 
sk_flush_backlog() Starting from commit d41a69f1d390 ("tcp: make tcp_sendmsg() aware of socket backlog") loopback flows got hurt, because for each skb sent, the socket receives an immediate ACK and sk_flush_backlog() causes extra work. Intent was to not let the backlog grow too much, but we went a bit too far. We can check the backlog every 16 skbs (about 1MB chunks) to increase TCP over loopback performance by about 15 % Note that the call to sk_flush_backlog() handles a single ACK, thanks to coalescing done on backlog, but cleans the 16 skbs found in rtx rb-tree. Reported-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a0a66321c0ee..f8fa1686f7f3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1162,7 +1162,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; - bool process_backlog = false; + int process_backlog = 0; bool zc = false; long timeo; @@ -1254,9 +1254,10 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - if (process_backlog && sk_flush_backlog(sk)) { - process_backlog = false; - goto restart; + if (unlikely(process_backlog >= 16)) { + process_backlog = 0; + if (sk_flush_backlog(sk)) + goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, @@ -1264,7 +1265,7 @@ new_segment: if (!skb) goto wait_for_memory; - process_backlog = true; + process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); -- cgit v1.2.3 From c04b79b6cfd714144f6a2cf359603d82ee631e62 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 7 Aug 2019 19:52:29 -0400 Subject: tcp: add new tcp_mtu_probe_floor sysctl The current implementation of TCP 
MTU probing can considerably underestimate the MTU on lossy connections allowing the MSS to get down to 48. We have found that in almost all of these cases on our networks these paths can handle much larger MTUs meaning the connections are being artificially limited. Even though TCP MTU probing can raise the MSS back up we have seen this not to be the case causing connections to be "stuck" with an MSS of 48 when heavy loss is present. Prior to pushing out this change we could not keep TCP MTU probing enabled b/c of the above reasons. Now with a reasonable floor set we've had it enabled for the past 6 months. The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives administrators the ability to control the floor of MSS probing. Signed-off-by: Josh Hunt Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_timer.c | 2 +- 5 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index df33674799b5..49e95f438ed7 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER Path MTU discovery (MTU probing). If MTU probing is enabled, this is the initial MSS used by the connection. +tcp_mtu_probe_floor - INTEGER + If MTU probing is enabled this caps the minimum MSS used for search_low + for the connection. + + Default : 48 + tcp_min_snd_mss - INTEGER TCP SYN and SYNACK messages usually advertise an ADVMSS option, as described in RFC 1122 and RFC 6691.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index bc24a8ec1ce5..c0c0791b1912 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -116,6 +116,7 @@ struct netns_ipv4 { int sysctl_tcp_l3mdev_accept; #endif int sysctl_tcp_mtu_probing; + int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0b980e841927..59ded25acd04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -819,6 +819,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &tcp_min_snd_mss_min, .extra2 = &tcp_min_snd_mss_max, }, + { + .procname = "tcp_mtu_probe_floor", + .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d57641cb3477..e0a372676329 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; + net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c801cd37cc2a..dbd9d2d0ee63 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); - mss = max(mss, 68 - 
tcp_sk(sk)->tcp_header_len); + mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor); mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } -- cgit v1.2.3 From af809709e9df2a44137429ba3694c339a11b710d Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 19 Aug 2019 20:05:15 +0800 Subject: net: remove empty inet_exit_net Pointer members of an object with static storage duration, if not explicitly initialized, will be initialized to a NULL pointer. The net namespace API checks if this pointer is not NULL before using it, so it is safe to remove the function. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ed2301ef872e..70f92aaca411 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1845,13 +1845,8 @@ static __net_init int inet_init_net(struct net *net) return 0; } -static __net_exit void inet_exit_net(struct net *net) -{ -} - static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, - .exit = inet_exit_net, }; static int __init init_inet_pernet_ops(void) -- cgit v1.2.3 From c76c992525245ec1c7b6738bf887c42099abab02 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Aug 2019 13:53:40 +0100 Subject: nexthops: remove redundant assignment to variable err Variable err is initialized to a value that is never read and it is re-assigned later. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused Value") Signed-off-by: Colin Ian King Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 5fe5a3981d43..fc34fd1668d6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1151,7 +1151,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = l3mdev_fib_table(cfg->dev); - int err = -EINVAL; + int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { -- cgit v1.2.3 From de8e1beb191912f94bcab46e72f247daaf664d15 Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Thu, 29 Aug 2019 10:02:44 -0400 Subject: tcp_bbr: clarify that bbr_bdp() rounds up in comments This explicitly clarifies that bbr_bdp() returns the rounded-up value of the bandwidth-delay product and why in the comments. Signed-off-by: Luke Hsiao Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 56be7d27f208..95b59540eee1 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -346,7 +346,7 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: * - * bdp = bw * min_rtt * gain + * bdp = ceil(bw * min_rtt * gain) * * The key factor, gain, controls the amount of queue. While a small gain * builds a smaller queue, it becomes more vulnerable to noise in RTT @@ -370,7 +370,9 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) w = (u64)bw * bbr->min_rtt_us; - /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + /* Apply a gain to the given value, remove the BW_SCALE shift, and + * round the value up to avoid a negative feedback loop. 
+ */ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; return bdp; -- cgit v1.2.3 From 61723b393292f1e4ea27f8d123384d50b176c29d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 30 Aug 2019 12:25:48 +0200 Subject: tcp: ulp: add functions to dump ulp-specific information currently, only getsockopt(TCP_ULP) can be invoked to know if a ULP is on top of a TCP socket. Extend idiag_get_aux() and idiag_get_aux_size(), introduced by commit b37e88407c1d ("inet_diag: allow protocols to provide additional data"), to report the ULP name and other information that can be made available by the ULP through optional functions. Users having CAP_NET_ADMIN privileges will then be able to retrieve this information through inet_diag_handler, if they specify INET_DIAG_INFO in the request. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +++ include/uapi/linux/inet_diag.h | 8 +++++++ net/ipv4/tcp_diag.c | 52 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index 77fe87f7a992..c9a3f9688223 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2122,6 +2122,9 @@ struct tcp_ulp_ops { void (*update)(struct sock *sk, struct proto *p); /* cleanup ulp */ void (*release)(struct sock *sk); + /* diagnostic */ + int (*get_info)(const struct sock *sk, struct sk_buff *skb); + size_t (*get_info_size)(const struct sock *sk); char name[TCP_ULP_NAME_MAX]; struct module *owner; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index e8baca85bac6..e2c6273274f3 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -153,11 +153,19 @@ enum { INET_DIAG_BBRINFO, /* request as INET_DIAG_VEGASINFO */ INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, + INET_DIAG_ULP_INFO, __INET_DIAG_MAX, }; #define INET_DIAG_MAX (__INET_DIAG_MAX - 1) +enum { + 
INET_ULP_INFO_UNSPEC, + INET_ULP_INFO_NAME, + __INET_ULP_INFO_MAX, +}; +#define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1) + /* INET_DIAG_MEM */ struct inet_diag_meminfo { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a3a386236d93..babc156deabb 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -81,13 +81,42 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb, } #endif +static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, + const struct tcp_ulp_ops *ulp_ops) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name); + if (err) + goto nla_failure; + + if (ulp_ops->get_info) + err = ulp_ops->get_info(sk, skb); + if (err) + goto nla_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_failure: + nla_nest_cancel(skb, nest); + return err; +} + static int tcp_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); + int err = 0; + #ifdef CONFIG_TCP_MD5SIG if (net_admin) { struct tcp_md5sig_info *md5sig; - int err = 0; rcu_read_lock(); md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); @@ -99,11 +128,21 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin, } #endif + if (net_admin) { + const struct tcp_ulp_ops *ulp_ops; + + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) + err = tcp_diag_put_ulp(skb, sk, ulp_ops); + if (err) + return err; + } return 0; } static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) { + struct inet_connection_sock *icsk = inet_csk(sk); size_t size = 0; #ifdef CONFIG_TCP_MD5SIG @@ -124,6 +163,17 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } #endif + if (net_admin) { + const struct tcp_ulp_ops *ulp_ops; + + ulp_ops = icsk->icsk_ulp_ops; + if (ulp_ops) { + size += nla_total_size(0) + + nla_total_size(TCP_ULP_NAME_MAX); + if (ulp_ops->get_info_size) + size += 
ulp_ops->get_info_size(sk); + } + } return size; } -- cgit v1.2.3 From b58662a5f7f4677debd5e28d10145cf5decd516b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Sep 2019 13:20:41 -0700 Subject: tcp: ulp: fix possible crash in tcp_diag_get_aux_size() tcp_diag_get_aux_size() can be called with sockets in any state. icsk_ulp_ops is only present for full sockets. For SYN_RECV or TIME_WAIT ones we would access garbage. Fixes: 61723b393292 ("tcp: ulp: add functions to dump ulp-specific information") Signed-off-by: Eric Dumazet Reported-by: Luke Hsiao Reported-by: Neal Cardwell Cc: Davide Caratti Acked-by: Davide Caratti Signed-off-by: David S. Miller --- net/ipv4/tcp_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index babc156deabb..81a8221d650a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -163,7 +163,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } #endif - if (net_admin) { + if (net_admin && sk_fullsock(sk)) { const struct tcp_ulp_ops *ulp_ops; ulp_ops = icsk->icsk_ulp_ops; -- cgit v1.2.3 From 0079ad8e8dc3a4d1af0dd4a53345580a6947beba Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 6 Sep 2019 15:36:01 +0800 Subject: ipmr: remove hard code cache_resolve_queue_len limit This is a re-post of previous patch wrote by David Miller[1]. Phil Karn reported[2] that on busy networks with lots of unresolved multicast routing entries, the creation of new multicast group routes can be extremely slow and unreliable. The reason is we hard-coded multicast route entries with unresolved source addresses(cache_resolve_queue_len) to 10. If some multicast route never resolves and the unresolved source addresses increased, there will be no ability to create new multicast route cache. 
To resolve this issue, we need to either add a sysctl entry to make the cache_resolve_queue_len configurable, or just remove the cache_resolve_queue_len limit directly, as we already have the socket receive queue limits of the mrouted socket, as pointed out by David. From my side, I'd prefer to remove the cache_resolve_queue_len limit instead of creating two more (IPv4 and IPv6 version) sysctl entries. [1] https://lkml.org/lkml/2018/7/22/11 [2] https://lkml.org/lkml/2018/7/21/343 v3: instead of removing cache_resolve_queue_len totally, let's only remove the hard-coded limit when allocating the unresolved cache, as Eric Dumazet suggested, so we don't need to re-count it in other places. v2: hold the mfc_unres_lock while walking the unresolved list in queue_count(), as Nikolay Aleksandrov reminded. Reported-by: Phil Karn Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 4 ++-- net/ipv6/ip6mr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c07bc82cbbe9..313470f6bb14 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1134,8 +1134,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, if (!found) { /* Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || - (c = ipmr_cache_alloc_unres()) == NULL) { + c = ipmr_cache_alloc_unres(); + if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e80d36c5073d..857a89ad4d6c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1148,8 +1148,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, * Create a new entry if allowable */ - if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || - (c = ip6mr_cache_alloc_unres()) == NULL) { + c = ip6mr_cache_alloc_unres(); + if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); -- cgit v1.2.3 From 
051ba67447de1294aaacc59752a37f72091d15ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Sep 2019 14:49:28 -0700 Subject: tcp: force a PSH flag on TSO packets When tcp sends a TSO packet, adding a PSH flag on it reduces the sojourn time of GRO packet in GRO receivers. This is particularly the case under pressure, since RX queues receive packets for many concurrent flows. A sender can give a hint to GRO engines when it is appropriate to flush a super-packet, especially when pacing is in the picture, since next packet is probably delayed by one ms. Having fewer packets in GRO engine reduces chance of LRU eviction or inflated RTT, and reduces GRO cost. We found recently that we must not set the PSH flag on individual full-size MSS segments [1] : Under pressure (CWR state), we better let the packet sit for a small delay (depending on NAPI logic) so that the ACK packet is delayed, and thus next packet we send is also delayed a bit. Eventually the bottleneck queue can be drained. DCTCP flows with CWND=1 have demonstrated the issue. This patch allows slowing down the aggregate traffic without involving high resolution timers on senders and/or receivers. It has been used at Google for about four years, and has been discussed at various networking conferences. [1] segments smaller than MSS already have PSH flag set by tcp_sendmsg() / tcp_mark_push(), unless MSG_MORE has been requested by the user. Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Daniel Borkmann Cc: Tariq Toukan Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 42abc9bd687a..fec6d67bfd14 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1050,11 +1050,22 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); - else + } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &md5); + /* Force a PSH flag on all (GSO) packets to expedite GRO flush + * at receiver : This slightly improve GRO performance. + * Note that we do not force the PSH flag for non GSO packets, + * because they might be sent under high congestion events, + * and in this case it is better to delay the delivery of 1-MSS + * packets and thus the corresponding ACK packet that would + * release the following packet. + */ + if (tcp_skb_pcount(skb) > 1) + tcb->tcp_flags |= TCPHDR_PSH; + } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* if no packet is in qdisc/device queue, then allow XPS to select -- cgit v1.2.3 From b0edba2af7154c82c28a4828f483c102ab201326 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Fri, 13 Sep 2019 09:13:02 +0100 Subject: netfilter: fix coding-style errors. Several header-files, Kconfig files and Makefiles have trailing white-space. Remove it. In netfilter/Kconfig, indent the type of CONFIG_NETFILTER_NETLINK_ACCT correctly. There are semicolons at the end of two function definitions in include/net/netfilter/nf_conntrack_acct.h and include/net/netfilter/nf_conntrack_ecache.h. Remove them. Fix indentation in nf_conntrack_l4proto.h. 
Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- include/linux/netfilter_ipv6.h | 2 +- include/net/netfilter/nf_conntrack_acct.h | 2 +- include/net/netfilter/nf_conntrack_ecache.h | 2 +- include/net/netfilter/nf_conntrack_expect.h | 2 +- include/net/netfilter/nf_conntrack_l4proto.h | 14 +++++++------- include/net/netfilter/nf_conntrack_tuple.h | 2 +- net/ipv4/netfilter/Kconfig | 8 ++++---- net/ipv4/netfilter/Makefile | 2 +- net/netfilter/Kconfig | 8 ++++---- net/netfilter/Makefile | 2 +- 11 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index ae62bf1c6824..b9bc25f57c8e 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -340,7 +340,7 @@ void xt_free_table_info(struct xt_table_info *info); /** * xt_recseq - recursive seqcount for netfilter use - * + * * Packet processing changes the seqcount only if no recursion happened * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 7beb681e1ce5..a889e376d197 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -1,7 +1,7 @@ /* IPv6-specific defines for netfilter. * (C)1998 Rusty Russell -- This code is GPL. 
* (C)1999 David Jeffery - * this header was blatantly ripped from netfilter_ipv4.h + * this header was blatantly ripped from netfilter_ipv4.h * it's amazing what adding a bunch of 6s can do =8^) */ #ifndef __LINUX_IP6_NETFILTER_H diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h index ad9f2172dee1..5b5287bb49db 100644 --- a/include/net/netfilter/nf_conntrack_acct.h +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -45,7 +45,7 @@ struct nf_conn_acct *nf_ct_acct_ext_add(struct nf_conn *ct, gfp_t gfp) #else return NULL; #endif -}; +} /* Check if connection tracking accounting is enabled */ static inline bool nf_ct_acct_enabled(struct net *net) diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 52b44192b43f..0815bfadfefe 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -61,7 +61,7 @@ nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) #else return NULL; #endif -}; +} #ifdef CONFIG_NF_CONNTRACK_EVENTS /* This structure is passed to event handler */ diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 573429be4d59..0855b60fba17 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -126,7 +126,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, const union nf_inet_addr *, u_int8_t, const __be16 *, const __be16 *); void nf_ct_expect_put(struct nf_conntrack_expect *exp); -int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags); static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect, unsigned int flags) diff --git a/include/net/netfilter/nf_conntrack_l4proto.h 
b/include/net/netfilter/nf_conntrack_l4proto.h index c200b95d27ae..97240f1a3f5f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -181,41 +181,41 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CONNTRACK) static inline struct nf_generic_net *nf_generic_pernet(struct net *net) { - return &net->ct.nf_ct_proto.generic; + return &net->ct.nf_ct_proto.generic; } static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) { - return &net->ct.nf_ct_proto.tcp; + return &net->ct.nf_ct_proto.tcp; } static inline struct nf_udp_net *nf_udp_pernet(struct net *net) { - return &net->ct.nf_ct_proto.udp; + return &net->ct.nf_ct_proto.udp; } static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) { - return &net->ct.nf_ct_proto.icmp; + return &net->ct.nf_ct_proto.icmp; } static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { - return &net->ct.nf_ct_proto.icmpv6; + return &net->ct.nf_ct_proto.icmpv6; } #endif #ifdef CONFIG_NF_CT_PROTO_DCCP static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) { - return &net->ct.nf_ct_proto.dccp; + return &net->ct.nf_ct_proto.dccp; } #endif #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { - return &net->ct.nf_ct_proto.sctp; + return &net->ct.nf_ct_proto.sctp; } #endif diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index 480c87b44a96..68ea9b932736 100644 --- a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -124,7 +124,7 @@ struct nf_conntrack_tuple_hash { #if IS_ENABLED(CONFIG_NETFILTER) static inline bool __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) -{ +{ return (nf_inet_addr_cmp(&t1->src.u3, &t2->src.u3) && t1->src.u.all == t2->src.u.all && t1->src.l3num == t2->src.l3num); diff --git 
a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 69e76d677f9e..f17b402111ce 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -272,7 +272,7 @@ config IP_NF_TARGET_CLUSTERIP The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing router/server/switch. - + To compile it as a module, choose M here. If unsure, say N. config IP_NF_TARGET_ECN @@ -281,7 +281,7 @@ config IP_NF_TARGET_ECN depends on NETFILTER_ADVANCED ---help--- This option adds a `ECN' target, which can be used in the iptables mangle - table. + table. You can use this target to remove the ECN bits from the IPv4 header of an IP packet. This is particularly useful, if you need to work around @@ -306,7 +306,7 @@ config IP_NF_RAW This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING and OUTPUT chains. - + If you want to compile it as a module, say M here and read . If unsure, say `N'. @@ -318,7 +318,7 @@ config IP_NF_SECURITY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. - + If unsure, say N. 
endif # IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c50e0ec095d2..7c497c78105f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o # flow table support obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o -# generic IP tables +# generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0d65f4d39494..34ec7afec116 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -20,7 +20,7 @@ config NETFILTER_FAMILY_ARP bool config NETFILTER_NETLINK_ACCT -tristate "Netfilter NFACCT over NFNETLINK interface" + tristate "Netfilter NFACCT over NFNETLINK interface" depends on NETFILTER_ADVANCED select NETFILTER_NETLINK help @@ -34,7 +34,7 @@ config NETFILTER_NETLINK_QUEUE help If this option is enabled, the kernel will include support for queueing packets via NFNETLINK. - + config NETFILTER_NETLINK_LOG tristate "Netfilter LOG over NFNETLINK interface" default m if NETFILTER_ADVANCED=n @@ -1502,7 +1502,7 @@ config NETFILTER_XT_MATCH_REALM This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. - This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option in tc world. If you want to compile it as a module, say M here and read @@ -1523,7 +1523,7 @@ config NETFILTER_XT_MATCH_SCTP depends on NETFILTER_ADVANCED default IP_SCTP help - With this option enabled, you will be able to use the + With this option enabled, you will be able to use the `sctp' match in order to match on SCTP source/destination ports and SCTP chunk types. 
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 9270a7fae484..4fc075b612fe 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -124,7 +124,7 @@ nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o -# generic X tables +# generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o # combos -- cgit v1.2.3 From c6af0c227a22bb6bb8ff72f043e0fb6d99fd6515 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 11 Sep 2019 15:50:51 -0400 Subject: ip: support SO_MARK cmsg Enable setting skb->mark for UDP and RAW sockets using cmsg. This is analogous to existing support for TOS, TTL, txtime, etc. Packet sockets already support this as of commit c7d39e32632e ("packet: support per-packet fwmark for af_packet sendmsg"). Similar to other fields, implement by 1. initialize the sockcm_cookie.mark from socket option sk_mark 2. optionally overwrite this in ip_cmsg_send/ip6_datagram_send_ctl 3. initialize inet_cork.mark from sockcm_cookie.mark 4. initialize each (usually just one) skb->mark from inet_cork.mark Step 1 is handled in one location for most protocols by ipcm_init_sk as of commit 351782067b6b ("ipv4: ipcm_cookie initializers"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 1 + include/net/ip.h | 1 + net/ipv4/ip_output.c | 3 ++- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 4 ++-- net/ipv4/udp.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/raw.c | 4 +++- net/ipv6/udp.c | 3 ++- 9 files changed, 15 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 7769c9b36d75..34c4436fd18f 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -160,6 +160,7 @@ struct inet_cork { char priority; __u16 gso_size; u64 transmit_time; + u32 mark; }; struct inet_cork_full { diff --git a/include/net/ip.h b/include/net/ip.h index 29d89de39822..95bb77f95bcc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -88,6 +88,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); + ipcm->sockc.mark = inet->sk.sk_mark; ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = inet->sk.sk_bound_dev_if; ipcm->addr = inet->inet_saddr; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cc7ef0d05bbd..5eb73775c3f7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1266,6 +1266,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; + cork->mark = ipc->sockc.mark; cork->priority = ipc->priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; @@ -1529,7 +1530,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, } skb->priority = (cork->tos != -1) ? 
cork->priority: sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = cork->mark; skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9d24ef5c5d8f..535427292194 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = inet->uc_index; - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 40a6abbc9cf6..80da5a66d5d7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb_reserve(skb, hlen); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -623,7 +623,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, hdrincl ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d88821c794fb..fbcd9be3a470 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1130,7 +1130,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e49fd62eea9..89a4c7c2e25d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1294,6 +1294,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; + cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) @@ -1764,7 +1765,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->daddr = *final_dst; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = cork->base.mark; skb->tstamp = cork->base.transmit_time; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 8a6131991e38..6e1888ee4036 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -646,7 +646,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_put(skb, length); @@ -810,6 +810,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.mark = sk->sk_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -891,6 +892,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; + 
fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 827fe7385078..2c8beb3896d1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1230,6 +1230,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.mark = sk->sk_mark; /* destination address check */ if (sin6) { @@ -1352,7 +1353,7 @@ do_udp_sendmsg: if (!fl6.flowi6_oif) fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { -- cgit v1.2.3 From acdcecc61285faed359f1a3568c32089cc3a8329 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 12 Sep 2019 21:16:39 -0400 Subject: udp: correct reuseport selection with connected sockets UDP reuseport groups can hold a mix unconnected and connected sockets. Ensure that connections only receive all traffic to their 4-tuple. Fast reuseport returns on the first reuseport match on the assumption that all matches are equal. Only if connections are present, return to the previous behavior of scoring all sockets. Record if connections are present and if so (1) treat such connected sockets as an independent match from the group, (2) only return 2-tuple matches from reuseport and (3) do not return on the first 2-tuple reuseport match to allow for a higher scoring match later. New field has_conns is set without locks. No other fields in the bitmap are modified at runtime and the field is only ever set unconditionally, so an RMW cannot miss a change. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Link: http://lkml.kernel.org/r/CA+FuTSfRP09aJNYRt04SS6qj22ViiOEWaWmLAwX0psk8-PGNxw@mail.gmail.com Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Acked-by: Craig Gallek Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/sock_reuseport.h | 20 +++++++++++++++++++- net/core/sock_reuseport.c | 15 +++++++++++++-- net/ipv4/datagram.c | 2 ++ net/ipv4/udp.c | 5 +++-- net/ipv6/datagram.c | 2 ++ net/ipv6/udp.c | 5 +++-- 6 files changed, 42 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index d9112de85261..43f4a818d88f 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -21,7 +21,8 @@ struct sock_reuseport { unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; - bool bind_inany; + unsigned int bind_inany:1; + unsigned int has_conns:1; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; @@ -37,6 +38,23 @@ extern struct sock *reuseport_select_sock(struct sock *sk, extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); extern int reuseport_detach_prog(struct sock *sk); +static inline bool reuseport_has_conns(struct sock *sk, bool set) +{ + struct sock_reuseport *reuse; + bool ret = false; + + rcu_read_lock(); + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (reuse) { + if (set) + reuse->has_conns = 1; + ret = reuse->has_conns; + } + rcu_read_unlock(); + + return ret; +} + int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 9408f9264d05..f3ceec93f392 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -295,8 +295,19 @@ struct sock *reuseport_select_sock(struct sock *sk, select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ - if (!sk2) - sk2 = reuse->socks[reciprocal_scale(hash, socks)]; + if (!sk2) { + int i, j; + + i = j = reciprocal_scale(hash, socks); + while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + i++; + if (i >= reuse->num_socks) + i = 0; + if (i == j) + 
goto out; + } + sk2 = reuse->socks[i]; + } } out: diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 7bd29e694603..9a0fe0c2fa02 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -15,6 +15,7 @@ #include #include #include +#include int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -69,6 +70,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; + reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); inet->inet_id = jiffies; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d88821c794fb..16486c8b708b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -423,12 +423,13 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - if (sk->sk_reuseport) { + if (sk->sk_reuseport && + sk->sk_state != TCP_ESTABLISHED) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (result) + if (result && !reuseport_has_conns(sk, false)) return result; } badness = score; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9ab897ded4df..96f939248d2f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -254,6 +255,7 @@ ipv4_connected: goto out; } + reuseport_has_conns(sk, true); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 827fe7385078..5995fdc99d3f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -158,13 +158,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - if (sk->sk_reuseport) { + if (sk->sk_reuseport && + sk->sk_state != TCP_ESTABLISHED) { hash = udp6_ehashfn(net, daddr, hnum, 
saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (result) + if (result && !reuseport_has_conns(sk, false)) return result; } result = sk; -- cgit v1.2.3 From f9af2dbbfe01def62765a58af7fbc488351893c3 Mon Sep 17 00:00:00 2001 From: Thomas Higdon Date: Fri, 13 Sep 2019 23:23:34 +0000 Subject: tcp: Add TCP_INFO counter for packets received out-of-order For receive-heavy cases on the server-side, we want to track the connection quality for individual client IPs. This counter, similar to the existing system-wide TCPOFOQueue counter in /proc/net/netstat, tracks out-of-order packet reception. By providing this counter in TCP_INFO, it will allow understanding to what degree receive-heavy sockets are experiencing out-of-order delivery and packet drops indicating congestion. Please note that this is similar to the counter in NetBSD TCP_INFO, and has the same name. Also note that we avoid increasing the size of the tcp_sock struct by taking advantage of a hole. Signed-off-by: Thomas Higdon Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 2 ++ include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 1 + 4 files changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f3a85a7fb4b1..99617e528ea2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -354,6 +354,8 @@ struct tcp_sock { #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif + u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ + /* Receiver side RTT estimation */ u32 rcv_rtt_last_tsecr; struct { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b3564f85a762..20237987ccc8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -270,6 +270,8 @@ struct tcp_info { __u64 tcpi_bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans */ __u32 tcpi_dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups */ __u32 tcpi_reord_seen; /* reordering events seen */ + + __u32 tcpi_rcv_ooopack; /* Out-of-order packets received */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 94df48bcecc2..4cf58208270e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2653,6 +2653,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ @@ -3295,6 +3296,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; + info->tcpi_rcv_ooopack = tp->rcv_ooopack; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7e94223fdb2b..3578357abe30 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4555,6 +4555,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); + 
tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); seq = TCP_SKB_CB(skb)->seq; end_seq = TCP_SKB_CB(skb)->end_seq; -- cgit v1.2.3 From 8f7baad7f03543451af27f5380fc816b008aa1f2 Mon Sep 17 00:00:00 2001 From: Thomas Higdon Date: Fri, 13 Sep 2019 23:23:35 +0000 Subject: tcp: Add snd_wnd to TCP_INFO Neal Cardwell mentioned that snd_wnd would be useful for diagnosing TCP performance problems -- > (1) Usually when we're diagnosing TCP performance problems, we do so > from the sender, since the sender makes most of the > performance-critical decisions (cwnd, pacing, TSO size, TSQ, etc). > From the sender-side the thing that would be most useful is to see > tp->snd_wnd, the receive window that the receiver has advertised to > the sender. This serves the purpose of adding an additional __u32 to avoid the would-be hole caused by the addition of the tcpi_rcv_ooopack field. Signed-off-by: Thomas Higdon Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/uapi/linux/tcp.h | 4 ++++ net/ipv4/tcp.c | 1 + 2 files changed, 5 insertions(+) (limited to 'net/ipv4') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 20237987ccc8..81e697978e8b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -272,6 +272,10 @@ struct tcp_info { __u32 tcpi_reord_seen; /* reordering events seen */ __u32 tcpi_rcv_ooopack; /* Out-of-order packets received */ + + __u32 tcpi_snd_wnd; /* peer's advertised receive window after + * scaling (bytes) + */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4cf58208270e..79c325a07ba5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3297,6 +3297,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; + info->tcpi_snd_wnd = tp->snd_wnd; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); -- cgit v1.2.3