summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile3
-rw-r--r--net/core/datagram.c28
-rw-r--r--net/core/dev.c596
-rw-r--r--net/core/dev_addr_lists.c85
-rw-r--r--net/core/drop_monitor.c2
-rw-r--r--net/core/dst.c31
-rw-r--r--net/core/ethtool.c216
-rw-r--r--net/core/filter.c1451
-rw-r--r--net/core/flow.c140
-rw-r--r--net/core/flow_dissector.c105
-rw-r--r--net/core/gen_estimator.c2
-rw-r--r--net/core/gen_stats.c2
-rw-r--r--net/core/iovec.c61
-rw-r--r--net/core/link_watch.c2
-rw-r--r--net/core/neighbour.c24
-rw-r--r--net/core/net-sysfs.c178
-rw-r--r--net/core/net_namespace.c14
-rw-r--r--net/core/netclassid_cgroup.c19
-rw-r--r--net/core/netpoll.c590
-rw-r--r--net/core/netprio_cgroup.c55
-rw-r--r--net/core/pktgen.c203
-rw-r--r--net/core/ptp_classifier.c193
-rw-r--r--net/core/request_sock.c44
-rw-r--r--net/core/rtnetlink.c376
-rw-r--r--net/core/secure_seq.c25
-rw-r--r--net/core/skbuff.c329
-rw-r--r--net/core/sock.c141
-rw-r--r--net/core/sock_diag.c27
-rw-r--r--net/core/timestamping.c72
-rw-r--r--net/core/tso.c77
-rw-r--r--net/core/utils.c8
31 files changed, 2956 insertions, 2143 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 5038f1ea0349..235e6c50708d 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o
+ sock_diag.o dev_ioctl.o tso.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
@@ -20,5 +20,6 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a16ed7bbe376..fdbc9a81d4c2 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -740,22 +740,42 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
+ skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
- return __skb_checksum_complete_head(skb, skb->len);
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+
+ return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
/**
- * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
+ * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
* @iov: io vector
diff --git a/net/core/dev.c b/net/core/dev.c
index 5e37e9abe8c5..130d64220229 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
+#include <linux/errqueue.h>
#include "net-sysfs.h"
@@ -148,6 +149,9 @@ struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
@@ -1082,6 +1086,7 @@ static int dev_get_valid_name(struct net *net,
*/
int dev_change_name(struct net_device *dev, const char *newname)
{
+ unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
@@ -1109,10 +1114,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
return err;
}
+ if (oldname[0] && !strchr(oldname, '%'))
+ netdev_info(dev, "renamed from %s\n", oldname);
+
+ old_assign_type = dev->name_assign_type;
+ dev->name_assign_type = NET_NAME_RENAMED;
+
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
write_seqcount_end(&devnet_rename_seq);
return ret;
}
@@ -1141,6 +1153,8 @@ rollback:
write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
+ old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
pr_err("%s: name change rollback failed: %d\n",
@@ -1207,7 +1221,11 @@ EXPORT_SYMBOL(netdev_features_change);
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
- call_netdevice_notifiers(NETDEV_CHANGE, dev);
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = 0;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
@@ -1245,7 +1263,7 @@ static int __dev_open(struct net_device *dev)
* If we don't do this there is a chance ndo_poll_controller
* or ndo_poll may be running while we open the device
*/
- netpoll_rx_disable(dev);
+ netpoll_poll_disable(dev);
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
@@ -1260,7 +1278,7 @@ static int __dev_open(struct net_device *dev)
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
- netpoll_rx_enable(dev);
+ netpoll_poll_enable(dev);
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
@@ -1312,6 +1330,9 @@ static int __dev_close_many(struct list_head *head)
might_sleep();
list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
clear_bit(__LINK_STATE_START, &dev->state);
@@ -1322,7 +1343,7 @@ static int __dev_close_many(struct list_head *head)
* dev->stop() will invoke napi_disable() on all of it's
* napi_struct instances on this device.
*/
- smp_mb__after_clear_bit(); /* Commit netif_running(). */
+ smp_mb__after_atomic(); /* Commit netif_running(). */
}
dev_deactivate_many(head);
@@ -1341,6 +1362,7 @@ static int __dev_close_many(struct list_head *head)
ops->ndo_stop(dev);
dev->flags &= ~IFF_UP;
+ netpoll_poll_enable(dev);
}
return 0;
@@ -1351,14 +1373,10 @@ static int __dev_close(struct net_device *dev)
int retval;
LIST_HEAD(single);
- /* Temporarily disable netpoll until the interface is down */
- netpoll_rx_disable(dev);
-
list_add(&dev->close_list, &single);
retval = __dev_close_many(&single);
list_del(&single);
- netpoll_rx_enable(dev);
return retval;
}
@@ -1396,14 +1414,9 @@ int dev_close(struct net_device *dev)
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
- /* Block netpoll rx while the interface is going down */
- netpoll_rx_disable(dev);
-
list_add(&dev->close_list, &single);
dev_close_many(&single);
list_del(&single);
-
- netpoll_rx_enable(dev);
}
return 0;
}
@@ -1643,8 +1656,7 @@ static inline void net_timestamp_set(struct sk_buff *skb)
__net_timestamp(SKB); \
} \
-static inline bool is_skb_forwardable(struct net_device *dev,
- struct sk_buff *skb)
+bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
unsigned int len;
@@ -1663,6 +1675,30 @@ static inline bool is_skb_forwardable(struct net_device *dev,
return false;
}
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ }
+
+ if (unlikely(!is_skb_forwardable(dev, skb))) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_scrub_packet(skb, true);
+ skb->protocol = eth_type_trans(skb, dev);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
* dev_forward_skb - loopback an skb to another netif
@@ -1684,24 +1720,7 @@ static inline bool is_skb_forwardable(struct net_device *dev,
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
- if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
- }
-
- if (unlikely(!is_skb_forwardable(dev, skb))) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
-
- skb_scrub_packet(skb, true);
- skb->protocol = eth_type_trans(skb, dev);
-
- return netif_rx_internal(skb);
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -2284,10 +2303,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-__be16 skb_network_protocol(struct sk_buff *skb)
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
+ unsigned int vlan_depth = skb->mac_len;
__be16 type = skb->protocol;
- int vlan_depth = ETH_HLEN;
/* Tunnel gso handlers can set protocol to ethernet. */
if (type == htons(ETH_P_TEB)) {
@@ -2300,17 +2319,34 @@ __be16 skb_network_protocol(struct sk_buff *skb)
type = eth->h_proto;
}
- while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
- struct vlan_hdr *vh;
+ /* if skb->protocol is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+ if (vlan_depth) {
+ if (WARN_ON(vlan_depth < VLAN_HLEN))
+ return 0;
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
+ }
+ do {
+ struct vlan_hdr *vh;
- if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
- return 0;
+ if (unlikely(!pskb_may_pull(skb,
+ vlan_depth + VLAN_HLEN)))
+ return 0;
- vh = (struct vlan_hdr *)(skb->data + vlan_depth);
- type = vh->h_vlan_encapsulated_proto;
- vlan_depth += VLAN_HLEN;
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (type == htons(ETH_P_8021Q) ||
+ type == htons(ETH_P_8021AD));
}
+ *depth = vlan_depth;
+
return type;
}
@@ -2324,12 +2360,13 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
- __be16 type = skb_network_protocol(skb);
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
if (unlikely(!type))
return ERR_PTR(-EINVAL);
- __skb_pull(skb, skb->mac_len);
+ __skb_pull(skb, vlan_depth);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
@@ -2386,8 +2423,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
skb_warn_bad_offload(skb);
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
return ERR_PTR(err);
}
@@ -2418,7 +2455,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);
* 2. No high memory really exists on this machine.
*/
-static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
+static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
int i;
@@ -2492,47 +2529,79 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
return 0;
}
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#ifdef CONFIG_NET_MPLS_GSO
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
+}
+#endif
+
static netdev_features_t harmonize_features(struct sk_buff *skb,
- const struct net_device *dev,
- netdev_features_t features)
+ netdev_features_t features)
{
+ int tmp;
+ __be16 type;
+
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
+
if (skb->ip_summed != CHECKSUM_NONE &&
- !can_checksum_protocol(features, skb_network_protocol(skb))) {
+ !can_checksum_protocol(features, type)) {
features &= ~NETIF_F_ALL_CSUM;
- } else if (illegal_highdma(dev, skb)) {
+ } else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
return features;
}
-netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
- const struct net_device *dev)
+netdev_features_t netif_skb_features(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
- netdev_features_t features = dev->features;
+ netdev_features_t features = skb->dev->features;
- if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
+ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
features &= ~NETIF_F_GSO_MASK;
if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
} else if (!vlan_tx_tag_present(skb)) {
- return harmonize_features(skb, dev, features);
+ return harmonize_features(skb, features);
}
- features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX);
+ features = netdev_intersect_features(features,
+ skb->dev->vlan_features |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
- features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
- NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX;
+ features = netdev_intersect_features(features,
+ NETIF_F_SG |
+ NETIF_F_HIGHDMA |
+ NETIF_F_FRAGLIST |
+ NETIF_F_GEN_CSUM |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
- return harmonize_features(skb, dev, features);
+ return harmonize_features(skb, features);
}
-EXPORT_SYMBOL(netif_skb_dev_features);
+EXPORT_SYMBOL(netif_skb_features);
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
@@ -2691,8 +2760,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
- * This permits __QDISC_STATE_RUNNING owner to get the lock more often
- * and dequeue packets faster.
+ * This permits __QDISC___STATE_RUNNING owner to get the lock more
+ * often and dequeue packets faster.
*/
contended = qdisc_is_running(q);
if (unlikely(contended))
@@ -2812,6 +2881,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_reset_mac_header(skb);
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
@@ -2878,6 +2950,7 @@ recursion_alert:
rc = -ENETDOWN;
rcu_read_unlock_bh();
+ atomic_long_inc(&dev->tx_dropped);
kfree_skb(skb);
return rc;
out:
@@ -2950,7 +3023,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb->rxhash & flow_table->mask;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
@@ -2984,6 +3057,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u16 tcpu;
+ u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
@@ -3012,7 +3086,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
skb_reset_network_header(skb);
- if (!skb_get_hash(skb))
+ hash = skb_get_hash(skb);
+ if (!hash)
goto done;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
@@ -3021,11 +3096,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
u16 next_cpu;
struct rps_dev_flow *rflow;
- rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+ rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
- next_cpu = sock_flow_table->ents[skb->rxhash &
- sock_flow_table->mask];
+ next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
/*
* If the desired CPU (where last recvmsg was done) is
@@ -3054,7 +3128,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
if (map) {
- tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+ tcpu = map->cpus[((u64) hash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
@@ -3229,10 +3303,6 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- /* if netpoll wants it, pretend we never saw it */
- if (netpoll_rx(skb))
- return NET_RX_DROP;
-
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
@@ -3343,7 +3413,7 @@ static void net_tx_action(struct softirq_action *h)
root_lock = qdisc_lock(q);
if (spin_trylock(root_lock)) {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
qdisc_run(q);
@@ -3353,7 +3423,7 @@ static void net_tx_action(struct softirq_action *h)
&q->state)) {
__netif_reschedule(q);
} else {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
}
@@ -3439,7 +3509,7 @@ out:
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
- * Register a receive hander for a device. This handler will then be
+ * Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
@@ -3493,11 +3563,11 @@ EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
switch (skb->protocol) {
- case __constant_htons(ETH_P_ARP):
- case __constant_htons(ETH_P_IP):
- case __constant_htons(ETH_P_IPV6):
- case __constant_htons(ETH_P_8021Q):
- case __constant_htons(ETH_P_8021AD):
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
return true;
default:
return false;
@@ -3518,10 +3588,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
trace_netif_receive_skb(skb);
- /* if we've gotten here through NAPI, check netpoll */
- if (netpoll_receive_skb(skb))
- goto out;
-
orig_dev = skb->dev;
skb_reset_network_header(skb);
@@ -3540,7 +3606,7 @@ another_round:
if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
- skb = vlan_untag(skb);
+ skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto unlock;
}
@@ -3648,7 +3714,6 @@ drop:
unlock:
rcu_read_unlock();
-out:
return ret;
}
@@ -3838,10 +3903,10 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
diffs |= p->vlan_tci ^ skb->vlan_tci;
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
- skb_gro_mac_header(skb));
+ skb_mac_header(skb));
else if (!diffs)
diffs = memcmp(skb_mac_header(p),
- skb_gro_mac_header(skb),
+ skb_mac_header(skb),
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
@@ -3864,6 +3929,27 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
}
}
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
@@ -3872,14 +3958,14 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
struct list_head *head = &offload_base;
int same_flow;
enum gro_result ret;
+ int grow;
- if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
+ if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
if (skb_is_gso(skb) || skb_has_frag_list(skb))
goto normal;
- skb_gro_reset_offset(skb);
gro_list_prepare(napi, skb);
NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
@@ -3937,33 +4023,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
ret = GRO_HELD;
pull:
- if (skb_headlen(skb) < skb_gro_offset(skb)) {
- int grow = skb_gro_offset(skb) - skb_headlen(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->tail += grow;
- skb->data_len -= grow;
-
- skb_shinfo(skb)->frags[0].page_offset += grow;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
-
- if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
- skb_frag_unref(skb, 0);
- memmove(skb_shinfo(skb)->frags,
- skb_shinfo(skb)->frags + 1,
- --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
- }
- }
-
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
ok:
return ret;
@@ -4031,6 +4100,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
trace_napi_gro_receive_entry(skb);
+ skb_gro_reset_offset(skb);
+
return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
@@ -4043,6 +4114,9 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->vlan_tci = 0;
skb->dev = napi->dev;
skb->skb_iif = 0;
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
napi->skb = skb;
}
@@ -4059,12 +4133,16 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_get_frags);
-static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
- gro_result_t ret)
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb_internal(skb))
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
@@ -4073,7 +4151,6 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
napi_reuse_skb(napi, skb);
break;
- case GRO_HELD:
case GRO_MERGED:
break;
}
@@ -4081,17 +4158,41 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
return ret;
}
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
napi->skb = NULL;
- if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
- napi_reuse_skb(napi, skb);
- return NULL;
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header_fast(skb, 0);
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
- skb->protocol = eth_type_trans(skb, skb->dev);
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
return skb;
}
@@ -4128,8 +4229,8 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
- __smp_call_function_single(remsd->cpu,
- &remsd->csd, 0);
+ smp_call_function_single_async(remsd->cpu,
+ &remsd->csd);
remsd = next;
}
} else
@@ -4153,9 +4254,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
#endif
napi->weight = weight_p;
local_irq_disable();
- while (work < quota) {
+ while (1) {
struct sk_buff *skb;
- unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
local_irq_enable();
@@ -4169,24 +4269,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
rps_lock(sd);
- qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen)
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
-
- if (qlen < quota - work) {
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
- * and NAPI_STATE_SCHED is the only possible flag set on backlog.
- * we can use a plain write instead of clear_bit(),
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
list_del(&napi->poll_list);
napi->state = 0;
+ rps_unlock(sd);
- quota = work + qlen;
+ break;
}
+
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
rps_unlock(sd);
}
local_irq_enable();
@@ -4216,7 +4316,7 @@ void __napi_complete(struct napi_struct *n)
BUG_ON(n->gro_list);
list_del(&n->poll_list);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
@@ -4507,6 +4607,32 @@ void *netdev_adjacent_get_private(struct list_head *adj_list)
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
+/**
* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
* @dev: device
* @iter: list_head ** of the current position
@@ -4553,8 +4679,7 @@ void *netdev_lower_get_next_private(struct net_device *dev,
if (&lower->list == &dev->adj_list.lower)
return NULL;
- if (iter)
- *iter = lower->list.next;
+ *iter = lower->list.next;
return lower->private;
}
@@ -4582,14 +4707,39 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
if (&lower->list == &dev->adj_list.lower)
return NULL;
- if (iter)
- *iter = &lower->list;
+ *iter = &lower->list;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
+ */
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_lower_get_next);
+
+/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
* lower neighbour list, RCU
* variant
@@ -4649,9 +4799,14 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev,
sysfs_remove_link(&(dev->dev.kobj), linkname);
}
-#define netdev_adjacent_is_neigh_list(dev, dev_list) \
- (dev_list == &dev->adj_list.upper || \
- dev_list == &dev->adj_list.lower)
+static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ return (dev_list == &dev->adj_list.upper ||
+ dev_list == &dev->adj_list.lower) &&
+ net_eq(dev_net(dev), dev_net(adj_dev));
+}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
@@ -4681,7 +4836,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
pr_debug("dev_hold for %s, because of link added from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
- if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
if (ret)
goto free_adj;
@@ -4702,7 +4857,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
return 0;
remove_symlinks:
- if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
kfree(adj);
@@ -4735,7 +4890,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
if (adj->master)
sysfs_remove_link(&(dev->dev.kobj), "master");
- if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
@@ -5005,11 +5160,65 @@ void netdev_upper_dev_unlink(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+void netdev_adjacent_add_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.lower);
+ }
+}
+
+void netdev_adjacent_del_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.lower);
+ }
+}
+
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
struct netdev_adjacent *iter;
+ struct net *net = dev_net(dev);
+
list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -5017,6 +5226,8 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -5039,6 +5250,30 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
+int dev_get_nest_level(struct net_device *dev,
+ bool (*type_check)(struct net_device *dev))
+{
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
+
+ ASSERT_RTNL();
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower, type_check);
+ if (max_nest < nest)
+ max_nest = nest;
+ }
+
+ if (type_check(dev))
+ max_nest++;
+
+ return max_nest;
+}
+EXPORT_SYMBOL(dev_get_nest_level);
+
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -5276,13 +5511,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
*/
ret = 0;
- if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
+ if ((old_flags ^ flags) & IFF_UP)
ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
- if (!ret)
- dev_set_rx_mode(dev);
- }
-
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
unsigned int old_flags = dev->flags;
@@ -5508,7 +5739,7 @@ static int dev_new_index(struct net *net)
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
-static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
@@ -5565,10 +5796,6 @@ static void rollback_registered_many(struct list_head *head)
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
-
/*
* Flush the unicast and multicast chains
*/
@@ -5578,6 +5805,10 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
@@ -5661,6 +5892,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (dev->netdev_ops->ndo_busy_poll)
+ features |= NETIF_F_BUSY_POLL;
+ else
+#endif
+ features &= ~NETIF_F_BUSY_POLL;
+
return features;
}
@@ -5796,10 +6034,7 @@ static void netdev_init_one_queue(struct net_device *dev,
static void netif_free_tx_queues(struct net_device *dev)
{
- if (is_vmalloc_addr(dev->_tx))
- vfree(dev->_tx);
- else
- kfree(dev->_tx);
+ kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
@@ -6236,6 +6471,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
netdev_stats_to_stats64(storage, &dev->stats);
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -6272,25 +6508,24 @@ void netdev_freemem(struct net_device *dev)
{
char *addr = (char *)dev - dev->padded;
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
+ kvfree(addr);
}
/**
* alloc_netdev_mqs - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @setup: callback to initialize device
- * @txqs: the number of TX subqueues to allocate
- * @rxqs: the number of RX subqueues to allocate
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
@@ -6369,6 +6604,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
strcpy(dev->name, name);
+ dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
@@ -6380,11 +6616,6 @@ free_all:
free_pcpu:
free_percpu(dev->pcpu_refcnt);
- netif_free_tx_queues(dev);
-#ifdef CONFIG_SYSFS
- kfree(dev->_rx);
-#endif
-
free_dev:
netdev_freemem(dev);
return NULL;
@@ -6481,6 +6712,9 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure stack wont be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
@@ -6490,6 +6724,7 @@ void unregister_netdevice_many(struct list_head *head)
rollback_registered_many(head);
list_for_each_entry(dev, head, unreg_list)
net_set_todo(dev);
+ list_del(head);
}
}
EXPORT_SYMBOL(unregister_netdevice_many);
@@ -6595,6 +6830,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Send a netdev-removed uevent to the old namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+ netdev_adjacent_del_links(dev);
/* Actually switch the network namespace */
dev_net_set(dev, net);
@@ -6609,6 +6845,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
@@ -6781,12 +7018,14 @@ static int __netdev_printk(const char *level, const struct net_device *dev,
if (dev && dev->dev.parent) {
r = dev_printk_emit(level[1] - '0',
dev->dev.parent,
- "%s %s %s: %pV",
+ "%s %s %s%s: %pV",
dev_driver_string(dev->dev.parent),
dev_name(dev->dev.parent),
- netdev_name(dev), vaf);
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
} else if (dev) {
- r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
+ r = printk("%s%s%s: %pV", level, netdev_name(dev),
+ netdev_reg_state(dev), vaf);
} else {
r = printk("%s(NULL net_device): %pV", level, vaf);
}
@@ -6938,14 +7177,13 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
else
unregister_netdevice_queue(dev, &dev_kill_list);
}
}
unregister_netdevice_many(&dev_kill_list);
- list_del(&dev_kill_list);
rtnl_unlock();
}
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 329d5794e7dc..b6b230600b97 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -225,6 +225,91 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
}
EXPORT_SYMBOL(__hw_addr_unsync);
+/**
+ * __hw_addr_sync_dev - Synchonize device's multicast list
+ * @list: address list to syncronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This funciton is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
+
+/**
+ * __hw_addr_unsync_dev - Remove synchonized addresses from device
+ * @list: address list to remove syncronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
+
static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index e70301eb7a4a..50f9a9db5792 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -289,10 +289,8 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
switch (info->genlhdr->cmd) {
case NET_DM_CMD_START:
return set_all_monitor_traces(TRACE_ON);
- break;
case NET_DM_CMD_STOP:
return set_all_monitor_traces(TRACE_OFF);
- break;
}
return -ENOTSUPP;
diff --git a/net/core/dst.c b/net/core/dst.c
index ca4231ec7347..a028409ee438 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -142,12 +142,12 @@ loop:
mutex_unlock(&dst_gc_mutex);
}
-int dst_discard(struct sk_buff *skb)
+int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
-EXPORT_SYMBOL(dst_discard);
+EXPORT_SYMBOL(dst_discard_sk);
const u32 dst_default_metrics[RTAX_MAX + 1] = {
/* This initializer is needed to force linker to place this variable
@@ -184,7 +184,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst->xfrm = NULL;
#endif
dst->input = dst_discard;
- dst->output = dst_discard;
+ dst->output = dst_discard_sk;
dst->error = 0;
dst->obsolete = initial_obsolete;
dst->header_len = 0;
@@ -209,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst)
/* The first case (dev==NULL) is required, when
protocol module is unloaded.
*/
- if (dst->dev == NULL || !(dst->dev->flags&IFF_UP))
- dst->input = dst->output = dst_discard;
+ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ }
dst->obsolete = DST_OBSOLETE_DEAD;
}
@@ -267,6 +269,15 @@ again:
}
EXPORT_SYMBOL(dst_destroy);
+static void dst_destroy_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst = dst_destroy(dst);
+ if (dst)
+ __dst_free(dst);
+}
+
void dst_release(struct dst_entry *dst)
{
if (dst) {
@@ -274,11 +285,8 @@ void dst_release(struct dst_entry *dst)
newrefcnt = atomic_dec_return(&dst->__refcnt);
WARN_ON(newrefcnt < 0);
- if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
- dst = dst_destroy(dst);
- if (dst)
- __dst_free(dst);
- }
+ if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
@@ -361,7 +369,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
return;
if (!unregister) {
- dst->input = dst->output = dst_discard;
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
} else {
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 30071dec287a..17cb912793fa 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXFCS_BIT] = "rx-fcs",
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
};
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
@@ -556,6 +557,23 @@ err_out:
return ret;
}
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ struct ethtool_rxnfc *rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= rx_rings->data)
+ return -EINVAL;
+
+ return 0;
+}
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -564,7 +582,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
int ret;
if (!dev->ethtool_ops->get_rxfh_indir_size ||
- !dev->ethtool_ops->get_rxfh_indir)
+ !dev->ethtool_ops->get_rxfh)
return -EOPNOTSUPP;
dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
if (dev_size == 0)
@@ -590,7 +608,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
if (!indir)
return -ENOMEM;
- ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL);
if (ret)
goto out;
@@ -612,8 +630,9 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
u32 *indir;
const struct ethtool_ops *ops = dev->ethtool_ops;
int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
- if (!ops->get_rxfh_indir_size || !ops->set_rxfh_indir ||
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
!ops->get_rxnfc)
return -EOPNOTSUPP;
@@ -642,28 +661,184 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
for (i = 0; i < dev_size; i++)
indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
} else {
- if (copy_from_user(indir,
- useraddr +
- offsetof(struct ethtool_rxfh_indir,
- ring_index[0]),
- dev_size * sizeof(indir[0]))) {
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + ringidx_offset,
+ &rx_rings,
+ dev_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ops->set_rxfh(dev, indir, NULL);
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 user_indir_size, user_key_size;
+ u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh rxfh;
+ u32 total_size;
+ u32 indir_bytes;
+ u32 *indir = NULL;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+
+ if (!(dev->ethtool_ops->get_rxfh_indir_size ||
+ dev->ethtool_ops->get_rxfh_key_size) ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
+
+ rxfh.indir_size = dev_indir_size;
+ rxfh.key_size = dev_key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size and key size. Otherwise, if the User size is
+ * not equal to device table size or key size it's an error.
+ */
+ if (!user_indir_size && !user_key_size)
+ return 0;
+
+ if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
+ (user_key_size && (user_key_size != dev_key_size)))
+ return -EINVAL;
+
+ indir_bytes = user_indir_size * sizeof(indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (user_indir_size)
+ indir = (u32 *)rss_config;
+
+ if (user_key_size)
+ hkey = rss_config + indir_bytes;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey);
+ if (!ret) {
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size))
ret = -EFAULT;
+ }
+
+ kfree(rss_config);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings;
+ struct ethtool_rxfh rxfh;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 *indir = NULL, indir_bytes = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+
+ if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) ||
+ !ops->get_rxnfc || !ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev);
+ if ((dev_key_size + dev_indir_size) == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1])
+ return -EINVAL;
+
+ /* If either indir or hash key is valid, proceed further.
+ * It is not valid to request that both be unchanged.
+ */
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0))
+ return -EINVAL;
+
+ if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ indir_bytes = dev_indir_size * sizeof(indir[0]);
+
+ rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ /* rxfh.indir_size == 0 means reset the indir table to default.
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ indir = (u32 *)rss_config;
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + rss_cfg_offset,
+ &rx_rings,
+ rxfh.indir_size);
+ if (ret)
goto out;
- }
+ } else if (rxfh.indir_size == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ }
- /* Validate ring indices */
- for (i = 0; i < dev_size; i++) {
- if (indir[i] >= rx_rings.data) {
- ret = -EINVAL;
- goto out;
- }
+ if (rxfh.key_size) {
+ hkey = rss_config + indir_bytes;
+ if (copy_from_user(hkey,
+ useraddr + rss_cfg_offset + indir_bytes,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out;
}
}
- ret = ops->set_rxfh_indir(dev, indir);
+ ret = ops->set_rxfh(dev, indir, hkey);
out:
- kfree(indir);
+ kfree(rss_config);
return ret;
}
@@ -1490,6 +1665,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
case ETHTOOL_GFEATURES:
case ETHTOOL_GCHANNELS:
case ETHTOOL_GET_TS_INFO:
@@ -1627,6 +1803,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SRXFHINDIR:
rc = ethtool_set_rxfh_indir(dev, useraddr);
break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
case ETHTOOL_GFEATURES:
rc = ethtool_get_features(dev, useraddr);
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index ad30d626a5bd..d814b8a89d0f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,11 +1,16 @@
/*
* Linux Socket Filter - Kernel level socket filtering
*
- * Author:
- * Jay Schulist <jschlst@samba.org>
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
*
- * Based on the design of:
- * - The Berkeley Packet Filter
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -13,7 +18,7 @@
* 2 of the License, or (at your option) any later version.
*
* Andi Kleen - Fix a few bad bugs and races.
- * Kris Katterjohn - Added many additional checks in sk_chk_filter()
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
#include <linux/module.h>
@@ -40,32 +45,6 @@
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
-/* No hurry in this branch
- *
- * Exported for the bpf jit load helper.
- */
-void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
-{
- u8 *ptr = NULL;
-
- if (k >= SKF_NET_OFF)
- ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
- ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
- if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
- return ptr;
- return NULL;
-}
-
-static inline void *load_pointer(const struct sk_buff *skb, int k,
- unsigned int size, void *buffer)
-{
- if (k >= 0)
- return skb_header_pointer(skb, k, size, buffer);
- return bpf_internal_load_pointer_neg_helper(skb, k, size);
-}
-
/**
* sk_filter - run a packet through a socket filter
* @sk: sock associated with &sk_buff
@@ -108,349 +87,562 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sk_filter);
+/* Helper to find the offset of pkt_type in sk_buff structure. We want
+ * to make sure its still a 3bit field starting at a byte boundary;
+ * taken from arch/x86/net/bpf_jit_comp.c.
+ */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define PKT_TYPE_MAX (7 << 5)
+#else
+#define PKT_TYPE_MAX 7
+#endif
+static unsigned int pkt_type_offset(void)
+{
+ struct sk_buff skb_probe = { .pkt_type = ~0, };
+ u8 *ct = (u8 *) &skb_probe;
+ unsigned int off;
+
+ for (off = 0; off < sizeof(struct sk_buff); off++) {
+ if (ct[off] == PKT_TYPE_MAX)
+ return off;
+ }
+
+ pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
+ return -1;
+}
+
+static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+}
+
+static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (nla->nla_len > skb->len - a)
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+/* note that this only generates 32-bit random numbers */
+static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct bpf_insn **insnp)
+{
+ struct bpf_insn *insn = *insnp;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
+ pkt_type_offset());
+ if (insn->off < 0)
+ return false;
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ insn++;
+ *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);
+#endif
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
+
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_tci));
+ if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* A >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
+ /* A &= 1 */
+ *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ break;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(__get_random_u32);
+ break;
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ *insnp = insn;
+ return true;
+}
+
/**
- * sk_run_filter - run a filter on a socket
- * @skb: buffer to run the filter on
- * @fentry: filter to apply
+ * bpf_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: buffer where converted program will be stored
+ * @new_len: pointer to store length of converted program
+ *
+ * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
+ * Conversion workflow:
*
- * Decode and apply filter instructions to the skb->data.
- * Return length to keep, 0 for none. @skb is the data we are
- * filtering, @filter is the array of filter instructions.
- * Because all jumps are guaranteed to be before last instruction,
- * and last instruction guaranteed to be a RET, we dont need to check
- * flen. (We used to pass to this function the length of filter)
+ * 1) First pass for calculating the new program length:
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
+ *
+ * 2) 2nd pass to remap in two passes: 1st pass finds new
+ * jump offsets, 2nd pass remapping:
+ * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
+ *
+ * User BPF's register A is mapped to our BPF register 6, user BPF
+ * register X is mapped to BPF register 7; frame pointer is always
+ * register 10; Context 'void *ctx' is stored in register 1, that is,
+ * for socket filters: ctx == 'struct sk_buff *', for seccomp:
+ * ctx == 'struct seccomp_data *'.
*/
-unsigned int sk_run_filter(const struct sk_buff *skb,
- const struct sock_filter *fentry)
+int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_insn *new_prog, int *new_len)
{
- void *ptr;
- u32 A = 0; /* Accumulator */
- u32 X = 0; /* Index Register */
- u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
- u32 tmp;
- int k;
+ int new_flen = 0, pass = 0, target, i;
+ struct bpf_insn *new_insn;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
- /*
- * Process array of filter instructions.
- */
- for (;; fentry++) {
-#if defined(CONFIG_X86_32)
-#define K (fentry->k)
-#else
- const u32 K = fentry->k;
-#endif
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
- switch (fentry->code) {
- case BPF_S_ALU_ADD_X:
- A += X;
- continue;
- case BPF_S_ALU_ADD_K:
- A += K;
- continue;
- case BPF_S_ALU_SUB_X:
- A -= X;
- continue;
- case BPF_S_ALU_SUB_K:
- A -= K;
- continue;
- case BPF_S_ALU_MUL_X:
- A *= X;
- continue;
- case BPF_S_ALU_MUL_K:
- A *= K;
- continue;
- case BPF_S_ALU_DIV_X:
- if (X == 0)
- return 0;
- A /= X;
- continue;
- case BPF_S_ALU_DIV_K:
- A /= K;
- continue;
- case BPF_S_ALU_MOD_X:
- if (X == 0)
- return 0;
- A %= X;
- continue;
- case BPF_S_ALU_MOD_K:
- A %= K;
- continue;
- case BPF_S_ALU_AND_X:
- A &= X;
- continue;
- case BPF_S_ALU_AND_K:
- A &= K;
- continue;
- case BPF_S_ALU_OR_X:
- A |= X;
- continue;
- case BPF_S_ALU_OR_K:
- A |= K;
- continue;
- case BPF_S_ANC_ALU_XOR_X:
- case BPF_S_ALU_XOR_X:
- A ^= X;
- continue;
- case BPF_S_ALU_XOR_K:
- A ^= K;
- continue;
- case BPF_S_ALU_LSH_X:
- A <<= X;
- continue;
- case BPF_S_ALU_LSH_K:
- A <<= K;
- continue;
- case BPF_S_ALU_RSH_X:
- A >>= X;
- continue;
- case BPF_S_ALU_RSH_K:
- A >>= K;
- continue;
- case BPF_S_ALU_NEG:
- A = -A;
- continue;
- case BPF_S_JMP_JA:
- fentry += K;
- continue;
- case BPF_S_JMP_JGT_K:
- fentry += (A > K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_K:
- fentry += (A >= K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_K:
- fentry += (A == K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_K:
- fentry += (A & K) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGT_X:
- fentry += (A > X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JGE_X:
- fentry += (A >= X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JEQ_X:
- fentry += (A == X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_JMP_JSET_X:
- fentry += (A & X) ? fentry->jt : fentry->jf;
- continue;
- case BPF_S_LD_W_ABS:
- k = K;
-load_w:
- ptr = load_pointer(skb, k, 4, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be32(ptr);
- continue;
- }
- return 0;
- case BPF_S_LD_H_ABS:
- k = K;
-load_h:
- ptr = load_pointer(skb, k, 2, &tmp);
- if (ptr != NULL) {
- A = get_unaligned_be16(ptr);
- continue;
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = new_prog;
+ fp = prog;
+
+ if (new_insn)
+ *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ new_insn++;
+
+ for (i = 0; i < len; fp++, i++) {
+ struct bpf_insn tmp_insns[6] = { };
+ struct bpf_insn *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - new_prog;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
+ break;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ if (target >= len || target < 0) \
+ goto err; \
+ insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ insn->off -= insn - tmp_insns; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+ /* BPF immediates are signed, zero extend
+ * immediate into tmp register and use it
+ * in compare insn.
+ */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_X;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
}
- return 0;
- case BPF_S_LD_B_ABS:
- k = K;
-load_b:
- ptr = load_pointer(skb, k, 1, &tmp);
- if (ptr != NULL) {
- A = *(u8 *)ptr;
- continue;
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
}
- return 0;
- case BPF_S_LD_W_LEN:
- A = skb->len;
- continue;
- case BPF_S_LDX_W_LEN:
- X = skb->len;
- continue;
- case BPF_S_LD_W_IND:
- k = X + K;
- goto load_w;
- case BPF_S_LD_H_IND:
- k = X + K;
- goto load_h;
- case BPF_S_LD_B_IND:
- k = X + K;
- goto load_b;
- case BPF_S_LDX_B_MSH:
- ptr = load_pointer(skb, K, 1, &tmp);
- if (ptr != NULL) {
- X = (*(u8 *)ptr & 0xf) << 2;
- continue;
+
+ /* Convert JEQ into JNE when 'jump_true' is next insn. */
+ if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
}
- return 0;
- case BPF_S_LD_IMM:
- A = K;
- continue;
- case BPF_S_LDX_IMM:
- X = K;
- continue;
- case BPF_S_LD_MEM:
- A = mem[K];
- continue;
- case BPF_S_LDX_MEM:
- X = mem[K];
- continue;
- case BPF_S_MISC_TAX:
- X = A;
- continue;
- case BPF_S_MISC_TXA:
- A = X;
- continue;
- case BPF_S_RET_K:
- return K;
- case BPF_S_RET_A:
- return A;
- case BPF_S_ST:
- mem[K] = A;
- continue;
- case BPF_S_STX:
- mem[K] = X;
- continue;
- case BPF_S_ANC_PROTOCOL:
- A = ntohs(skb->protocol);
- continue;
- case BPF_S_ANC_PKTTYPE:
- A = skb->pkt_type;
- continue;
- case BPF_S_ANC_IFINDEX:
- if (!skb->dev)
- return 0;
- A = skb->dev->ifindex;
- continue;
- case BPF_S_ANC_MARK:
- A = skb->mark;
- continue;
- case BPF_S_ANC_QUEUE:
- A = skb->queue_mapping;
- continue;
- case BPF_S_ANC_HATYPE:
- if (!skb->dev)
- return 0;
- A = skb->dev->type;
- continue;
- case BPF_S_ANC_RXHASH:
- A = skb->rxhash;
- continue;
- case BPF_S_ANC_CPU:
- A = raw_smp_processor_id();
- continue;
- case BPF_S_ANC_VLAN_TAG:
- A = vlan_tx_tag_get(skb);
- continue;
- case BPF_S_ANC_VLAN_TAG_PRESENT:
- A = !!vlan_tx_tag_present(skb);
- continue;
- case BPF_S_ANC_PAY_OFFSET:
- A = __skb_get_poff(skb);
- continue;
- case BPF_S_ANC_NLATTR: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = nla_find((struct nlattr *)&skb->data[A],
- skb->len - A, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
- }
- case BPF_S_ANC_NLATTR_NEST: {
- struct nlattr *nla;
-
- if (skb_is_nonlinear(skb))
- return 0;
- if (A > skb->len - sizeof(struct nlattr))
- return 0;
-
- nla = (struct nlattr *)&skb->data[A];
- if (nla->nla_len > A - skb->len)
- return 0;
-
- nla = nla_find_nested(nla, X);
- if (nla)
- A = (void *)nla - (void *)skb->data;
- else
- A = 0;
- continue;
- }
-#ifdef CONFIG_SECCOMP_FILTER
- case BPF_S_ANC_SECCOMP_LD_W:
- A = seccomp_bpf_load(fentry->k);
- continue;
-#endif
+
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+ /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B:
+ /* tmp = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+
+ /* RET_K, RET_A are remaped into 2 insns. */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
+ BPF_K : BPF_X, BPF_REG_0,
+ BPF_REG_A, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
+ case BPF_ST:
+ case BPF_STX:
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+ /* Unkown instruction. */
default:
- WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
- fentry->code, fentry->jt,
- fentry->jf, fentry->k);
- return 0;
+ goto err;
}
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
+ }
+
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - new_prog;
+ return 0;
+ }
+
+ pass++;
+ if (new_flen != new_insn - new_prog) {
+ new_flen = new_insn - new_prog;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
}
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
}
-EXPORT_SYMBOL(sk_run_filter);
-/*
- * Security :
+/* Security:
+ *
* A BPF program is able to use 16 cells of memory to store intermediate
- * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
+ * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
+ *
* As we dont want to clear mem[] array for each packet going through
* sk_run_filter(), we check that filter loaded by user never try to read
* a cell if not previously written, and we check all branches to be sure
* a malicious user doesn't try to abuse us.
*/
-static int check_load_and_stores(struct sock_filter *filter, int flen)
+static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
- u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
int pc, ret = 0;
BUILD_BUG_ON(BPF_MEMWORDS > 16);
- masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
if (!masks)
return -ENOMEM;
+
memset(masks, 0xff, flen * sizeof(*masks));
for (pc = 0; pc < flen; pc++) {
memvalid &= masks[pc];
switch (filter[pc].code) {
- case BPF_S_ST:
- case BPF_S_STX:
+ case BPF_ST:
+ case BPF_STX:
memvalid |= (1 << filter[pc].k);
break;
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
if (!(memvalid & (1 << filter[pc].k))) {
ret = -EINVAL;
goto error;
}
break;
- case BPF_S_JMP_JA:
- /* a jump must set masks on target */
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
masks[pc + 1 + filter[pc].k] &= memvalid;
memvalid = ~0;
break;
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_X:
- case BPF_S_JMP_JSET_K:
- /* a jump must set masks on targets */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
masks[pc + 1 + filter[pc].jt] &= memvalid;
masks[pc + 1 + filter[pc].jf] &= memvalid;
memvalid = ~0;
@@ -462,8 +654,74 @@ error:
return ret;
}
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
+}
+
/**
- * sk_chk_filter - verify socket filter code
+ * bpf_check_classic - verify socket filter code
* @filter: filter to verify
* @flen: length of filter
*
@@ -476,231 +734,315 @@ error:
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
+int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
{
- /*
- * Valid instructions are initialized to non-0.
- * Invalid instructions are initialized to 0.
- */
- static const u8 codes[] = {
- [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K,
- [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X,
- [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K,
- [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X,
- [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K,
- [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X,
- [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X,
- [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K,
- [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X,
- [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K,
- [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X,
- [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K,
- [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X,
- [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K,
- [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X,
- [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K,
- [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X,
- [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K,
- [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X,
- [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG,
- [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS,
- [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS,
- [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS,
- [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN,
- [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND,
- [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND,
- [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND,
- [BPF_LD|BPF_IMM] = BPF_S_LD_IMM,
- [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN,
- [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH,
- [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM,
- [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX,
- [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA,
- [BPF_RET|BPF_K] = BPF_S_RET_K,
- [BPF_RET|BPF_A] = BPF_S_RET_A,
- [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K,
- [BPF_LD|BPF_MEM] = BPF_S_LD_MEM,
- [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM,
- [BPF_ST] = BPF_S_ST,
- [BPF_STX] = BPF_S_STX,
- [BPF_JMP|BPF_JA] = BPF_S_JMP_JA,
- [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K,
- [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X,
- [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K,
- [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X,
- [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K,
- [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X,
- [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
- [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
- };
- int pc;
bool anc_found;
+ int pc;
if (flen == 0 || flen > BPF_MAXINSNS)
return -EINVAL;
- /* check the filter code now */
+ /* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
- struct sock_filter *ftest = &filter[pc];
- u16 code = ftest->code;
+ const struct sock_filter *ftest = &filter[pc];
- if (code >= ARRAY_SIZE(codes))
- return -EINVAL;
- code = codes[code];
- if (!code)
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
return -EINVAL;
+
/* Some instructions need special checks */
- switch (code) {
- case BPF_S_ALU_DIV_K:
- case BPF_S_ALU_MOD_K:
- /* check for division by zero */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
if (ftest->k == 0)
return -EINVAL;
break;
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
- case BPF_S_ST:
- case BPF_S_STX:
- /* check for invalid memory addresses */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* Check for invalid memory addresses */
if (ftest->k >= BPF_MEMWORDS)
return -EINVAL;
break;
- case BPF_S_JMP_JA:
- /*
- * Note, the large ftest->k might cause loops.
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
* Compare this with conditional jumps below,
* where offsets are limited. --ANK (981016)
*/
- if (ftest->k >= (unsigned int)(flen-pc-1))
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
return -EINVAL;
break;
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_X:
- case BPF_S_JMP_JSET_K:
- /* for conditionals both must be safe */
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
if (pc + ftest->jt + 1 >= flen ||
pc + ftest->jf + 1 >= flen)
return -EINVAL;
break;
- case BPF_S_LD_W_ABS:
- case BPF_S_LD_H_ABS:
- case BPF_S_LD_B_ABS:
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
anc_found = false;
-#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \
- code = BPF_S_ANC_##CODE; \
- anc_found = true; \
- break
- switch (ftest->k) {
- ANCILLARY(PROTOCOL);
- ANCILLARY(PKTTYPE);
- ANCILLARY(IFINDEX);
- ANCILLARY(NLATTR);
- ANCILLARY(NLATTR_NEST);
- ANCILLARY(MARK);
- ANCILLARY(QUEUE);
- ANCILLARY(HATYPE);
- ANCILLARY(RXHASH);
- ANCILLARY(CPU);
- ANCILLARY(ALU_XOR_X);
- ANCILLARY(VLAN_TAG);
- ANCILLARY(VLAN_TAG_PRESENT);
- ANCILLARY(PAY_OFFSET);
- }
-
- /* ancillary operation unknown or unsupported */
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
if (anc_found == false && ftest->k >= SKF_AD_OFF)
return -EINVAL;
}
- ftest->code = code;
}
- /* last instruction must be a RET code */
+ /* Last instruction must be a RET code */
switch (filter[flen - 1].code) {
- case BPF_S_RET_K:
- case BPF_S_RET_A:
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
return check_load_and_stores(filter, flen);
}
+
return -EINVAL;
}
-EXPORT_SYMBOL(sk_chk_filter);
+EXPORT_SYMBOL(bpf_check_classic);
+
+static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
+ const struct sock_fprog *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct sock_fprog_kern *fkprog;
+
+ fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+ if (!fp->orig_prog)
+ return -ENOMEM;
+
+ fkprog = fp->orig_prog;
+ fkprog->len = fprog->len;
+ fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+ if (!fkprog->filter) {
+ kfree(fp->orig_prog);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void bpf_release_orig_filter(struct bpf_prog *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
+static void __bpf_prog_release(struct bpf_prog *prog)
+{
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+}
+
+static void __sk_filter_release(struct sk_filter *fp)
+{
+ __bpf_prog_release(fp->prog);
+ kfree(fp);
+}
/**
* sk_filter_release_rcu - Release a socket filter by rcu_head
* @rcu: rcu_head that contains the sk_filter to free
*/
-void sk_filter_release_rcu(struct rcu_head *rcu)
+static void sk_filter_release_rcu(struct rcu_head *rcu)
{
struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
- bpf_jit_free(fp);
+ __sk_filter_release(fp);
}
-EXPORT_SYMBOL(sk_filter_release_rcu);
-static int __sk_prepare_filter(struct sk_filter *fp)
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
{
- int err;
+ if (atomic_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ atomic_sub(filter_size, &sk->sk_omem_alloc);
+ sk_filter_release(fp);
+}
+
+/* try to charge the socket memory if there is space available
+ * return true on success
+ */
+bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ /* same check as in sock_kmalloc() */
+ if (filter_size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ atomic_inc(&fp->refcnt);
+ atomic_add(filter_size, &sk->sk_omem_alloc);
+ return true;
+ }
+ return false;
+}
+
+static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
+{
+ struct sock_filter *old_prog;
+ struct bpf_prog *old_fp;
+ int err, new_len, old_len = fp->len;
+
+ /* We are free to overwrite insns et al right here as it
+ * won't be used at this point in time anymore internally
+ * after the migration to the internal BPF instruction
+ * representation.
+ */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct bpf_insn));
- fp->bpf_func = sk_run_filter;
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
+ GFP_KERNEL);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
- err = sk_chk_filter(fp->insns, fp->len);
+ /* 1st pass: calculate the new program length. */
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
if (err)
- return err;
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL);
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
+ err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ if (err)
+ /* 2nd bpf_convert_filter() can fail only if it fails
+ * to allocate memory, remapping must succeed. Note,
+ * that at this time old_fp has already been released
+ * by krealloc().
+ */
+ goto out_err_free;
+
+ bpf_prog_select_runtime(fp);
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+}
+
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
+{
+ int err;
+
+ fp->bpf_func = NULL;
+ fp->jited = 0;
+ err = bpf_check_classic(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
bpf_jit_compile(fp);
- return 0;
+
+ /* JIT compiler couldn't process this filter, so do the
+ * internal BPF translation for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = bpf_migrate_filter(fp);
+
+ return fp;
}
/**
- * sk_unattached_filter_create - create an unattached filter
- * @fprog: the filter program
+ * bpf_prog_create - create an unattached filter
* @pfp: the unattached filter that is created
+ * @fprog: the filter program
*
* Create a filter independent of any socket. We first run some
* sanity checks on it to make sure it does not explode on us later.
* If an error occurs or there is insufficient memory for the filter
* a negative errno code is returned. On success the return is zero.
*/
-int sk_unattached_filter_create(struct sk_filter **pfp,
- struct sock_fprog *fprog)
+int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
- struct sk_filter *fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
- int err;
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
return -EINVAL;
- fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
+ fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL);
if (!fp)
return -ENOMEM;
+
memcpy(fp->insns, fprog->filter, fsize);
- atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
- err = __sk_prepare_filter(fp);
- if (err)
- goto free_mem;
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
*pfp = fp;
return 0;
-free_mem:
- kfree(fp);
- return err;
}
-EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
+EXPORT_SYMBOL_GPL(bpf_prog_create);
-void sk_unattached_filter_destroy(struct sk_filter *fp)
+void bpf_prog_destroy(struct bpf_prog *fp)
{
- sk_filter_release(fp);
+ __bpf_prog_release(fp);
}
-EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
+EXPORT_SYMBOL_GPL(bpf_prog_destroy);
/**
* sk_attach_filter - attach a socket filter
@@ -715,8 +1057,9 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct sk_filter *fp, *old_fp;
- unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
- unsigned int sk_fsize = sk_filter_size(fprog->len);
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ unsigned int bpf_fsize = bpf_prog_size(fprog->len);
+ struct bpf_prog *prog;
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
@@ -726,21 +1069,42 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (fprog->filter == NULL)
return -EINVAL;
- fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
- if (!fp)
+ prog = kmalloc(bpf_fsize, GFP_KERNEL);
+ if (!prog)
return -ENOMEM;
- if (copy_from_user(fp->insns, fprog->filter, fsize)) {
- sock_kfree_s(sk, fp, sk_fsize);
+
+ if (copy_from_user(prog->insns, fprog->filter, fsize)) {
+ kfree(prog);
return -EFAULT;
}
- atomic_set(&fp->refcnt, 1);
- fp->len = fprog->len;
+ prog->len = fprog->len;
- err = __sk_prepare_filter(fp);
+ err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
- sk_filter_uncharge(sk, fp);
- return err;
+ kfree(prog);
+ return -ENOMEM;
+ }
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ prog = bpf_prepare_filter(prog);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp) {
+ __bpf_prog_release(prog);
+ return -ENOMEM;
+ }
+ fp->prog = prog;
+
+ atomic_set(&fp->refcnt, 0);
+
+ if (!sk_filter_charge(sk, fp)) {
+ __sk_filter_release(fp);
+ return -ENOMEM;
}
old_fp = rcu_dereference_protected(sk->sk_filter,
@@ -749,6 +1113,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (old_fp)
sk_filter_uncharge(sk, old_fp);
+
return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
@@ -768,116 +1133,46 @@ int sk_detach_filter(struct sock *sk)
sk_filter_uncharge(sk, filter);
ret = 0;
}
+
return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
-void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
-{
- static const u16 decodes[] = {
- [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K,
- [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X,
- [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K,
- [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X,
- [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K,
- [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X,
- [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X,
- [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K,
- [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X,
- [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K,
- [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X,
- [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K,
- [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X,
- [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K,
- [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X,
- [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K,
- [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X,
- [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K,
- [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X,
- [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG,
- [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS,
- [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS,
- [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS,
- [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN,
- [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND,
- [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND,
- [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND,
- [BPF_S_LD_IMM] = BPF_LD|BPF_IMM,
- [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN,
- [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH,
- [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM,
- [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX,
- [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA,
- [BPF_S_RET_K] = BPF_RET|BPF_K,
- [BPF_S_RET_A] = BPF_RET|BPF_A,
- [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K,
- [BPF_S_LD_MEM] = BPF_LD|BPF_MEM,
- [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM,
- [BPF_S_ST] = BPF_ST,
- [BPF_S_STX] = BPF_STX,
- [BPF_S_JMP_JA] = BPF_JMP|BPF_JA,
- [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K,
- [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X,
- [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K,
- [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X,
- [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K,
- [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X,
- [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K,
- [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X,
- };
- u16 code;
-
- code = filt->code;
-
- to->code = decodes[code];
- to->jt = filt->jt;
- to->jf = filt->jf;
- to->k = filt->k;
-}
-
-int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len)
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len)
{
+ struct sock_fprog_kern *fprog;
struct sk_filter *filter;
- int i, ret;
+ int ret = 0;
lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
- ret = 0;
+ sock_owned_by_user(sk));
if (!filter)
goto out;
- ret = filter->len;
+
+ /* We're copying the filter that has been originally attached,
+ * so no conversion/decode needed anymore.
+ */
+ fprog = filter->prog->orig_prog;
+
+ ret = fprog->len;
if (!len)
+ /* User space only enquires number of filter blocks. */
goto out;
+
ret = -EINVAL;
- if (len < filter->len)
+ if (len < fprog->len)
goto out;
ret = -EFAULT;
- for (i = 0; i < filter->len; i++) {
- struct sock_filter fb;
-
- sk_decode_filter(&filter->insns[i], &fb);
- if (copy_to_user(&ubuf[i], &fb, sizeof(fb)))
- goto out;
- }
+ if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ goto out;
- ret = filter->len;
+ /* Instead of bytes, the API requests to return the number
+ * of filter blocks.
+ */
+ ret = fprog->len;
out:
release_sock(sk);
return ret;
diff --git a/net/core/flow.c b/net/core/flow.c
index dfa602ceb8cd..a0348fde1fdf 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -24,6 +24,7 @@
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
struct flow_cache_entry {
union {
@@ -38,37 +39,14 @@ struct flow_cache_entry {
struct flow_cache_object *object;
};
-struct flow_cache_percpu {
- struct hlist_head *hash_table;
- int hash_count;
- u32 hash_rnd;
- int hash_rnd_recalc;
- struct tasklet_struct flush_tasklet;
-};
-
struct flow_flush_info {
struct flow_cache *cache;
atomic_t cpuleft;
struct completion completion;
};
-struct flow_cache {
- u32 hash_shift;
- struct flow_cache_percpu __percpu *percpu;
- struct notifier_block hotcpu_notifier;
- int low_watermark;
- int high_watermark;
- struct timer_list rnd_timer;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-EXPORT_SYMBOL(flow_cache_genid);
-static struct flow_cache flow_cache_global;
static struct kmem_cache *flow_cachep __read_mostly;
-static DEFINE_SPINLOCK(flow_cache_gc_lock);
-static LIST_HEAD(flow_cache_gc_list);
-
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
@@ -84,16 +62,18 @@ static void flow_cache_new_hashrnd(unsigned long arg)
add_timer(&fc->rnd_timer);
}
-static int flow_entry_valid(struct flow_cache_entry *fle)
+static int flow_entry_valid(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
- if (atomic_read(&flow_cache_genid) != fle->genid)
+ if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
return 0;
if (fle->object && !fle->object->ops->check(fle->object))
return 0;
return 1;
}
-static void flow_entry_kill(struct flow_cache_entry *fle)
+static void flow_entry_kill(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
{
if (fle->object)
fle->object->ops->delete(fle->object);
@@ -104,26 +84,28 @@ static void flow_cache_gc_task(struct work_struct *work)
{
struct list_head gc_list;
struct flow_cache_entry *fce, *n;
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
INIT_LIST_HEAD(&gc_list);
- spin_lock_bh(&flow_cache_gc_lock);
- list_splice_tail_init(&flow_cache_gc_list, &gc_list);
- spin_unlock_bh(&flow_cache_gc_lock);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
- flow_entry_kill(fce);
+ flow_entry_kill(fce, xfrm);
}
-static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
- int deleted, struct list_head *gc_list)
+ int deleted, struct list_head *gc_list,
+ struct netns_xfrm *xfrm)
{
if (deleted) {
fcp->hash_count -= deleted;
- spin_lock_bh(&flow_cache_gc_lock);
- list_splice_tail(gc_list, &flow_cache_gc_list);
- spin_unlock_bh(&flow_cache_gc_lock);
- schedule_work(&flow_cache_gc_work);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+ schedule_work(&xfrm->flow_cache_gc_work);
}
}
@@ -135,6 +117,8 @@ static void __flow_cache_shrink(struct flow_cache *fc,
struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
int saved = 0;
@@ -142,7 +126,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (saved < shrink_to &&
- flow_entry_valid(fle)) {
+ flow_entry_valid(fle, xfrm)) {
saved++;
} else {
deleted++;
@@ -152,7 +136,7 @@ static void __flow_cache_shrink(struct flow_cache *fc,
}
}
- flow_cache_queue_garbage(fcp, deleted, &gc_list);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}
static void flow_cache_shrink(struct flow_cache *fc,
@@ -208,7 +192,7 @@ struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver, void *ctx)
{
- struct flow_cache *fc = &flow_cache_global;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle, *tfle;
struct flow_cache_object *flo;
@@ -258,7 +242,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
fcp->hash_count++;
}
- } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+ } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
flo = fle->object;
if (!flo)
goto ret_object;
@@ -279,7 +263,7 @@ nocache:
}
flo = resolver(net, key, family, dir, flo, ctx);
if (fle) {
- fle->genid = atomic_read(&flow_cache_genid);
+ fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
if (!IS_ERR(flo))
fle->object = flo;
else
@@ -303,12 +287,14 @@ static void flow_cache_flush_tasklet(unsigned long data)
struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
fcp = this_cpu_ptr(fc->percpu);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
- if (flow_entry_valid(fle))
+ if (flow_entry_valid(fle, xfrm))
continue;
deleted++;
@@ -317,7 +303,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
}
}
- flow_cache_queue_garbage(fcp, deleted, &gc_list);
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
@@ -351,10 +337,9 @@ static void flow_cache_flush_per_cpu(void *data)
tasklet_schedule(tasklet);
}
-void flow_cache_flush(void)
+void flow_cache_flush(struct net *net)
{
struct flow_flush_info info;
- static DEFINE_MUTEX(flow_flush_sem);
cpumask_var_t mask;
int i, self;
@@ -365,8 +350,8 @@ void flow_cache_flush(void)
/* Don't want cpus going down or up during this. */
get_online_cpus();
- mutex_lock(&flow_flush_sem);
- info.cache = &flow_cache_global;
+ mutex_lock(&net->xfrm.flow_flush_sem);
+ info.cache = &net->xfrm.flow_cache_global;
for_each_online_cpu(i)
if (!flow_cache_percpu_empty(info.cache, i))
cpumask_set_cpu(i, mask);
@@ -386,21 +371,23 @@ void flow_cache_flush(void)
wait_for_completion(&info.completion);
done:
- mutex_unlock(&flow_flush_sem);
+ mutex_unlock(&net->xfrm.flow_flush_sem);
put_online_cpus();
free_cpumask_var(mask);
}
static void flow_cache_flush_task(struct work_struct *work)
{
- flow_cache_flush();
-}
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
+ struct net *net = container_of(xfrm, struct net, xfrm);
-static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+ flow_cache_flush(net);
+}
-void flow_cache_flush_deferred(void)
+void flow_cache_flush_deferred(struct net *net)
{
- schedule_work(&flow_cache_flush_work);
+ schedule_work(&net->xfrm.flow_cache_flush_work);
}
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
@@ -425,7 +412,8 @@ static int flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+ struct flow_cache *fc = container_of(nfb, struct flow_cache,
+ hotcpu_notifier);
int res, cpu = (unsigned long) hcpu;
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
@@ -444,9 +432,20 @@ static int flow_cache_cpu(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static int __init flow_cache_init(struct flow_cache *fc)
+int flow_cache_init(struct net *net)
{
int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+ if (!flow_cachep)
+ flow_cachep = kmem_cache_create("flow_cache",
+ sizeof(struct flow_cache_entry),
+ 0, SLAB_PANIC, NULL);
+ spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+ INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+ INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+ INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+ mutex_init(&net->xfrm.flow_flush_sem);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
@@ -456,6 +455,8 @@ static int __init flow_cache_init(struct flow_cache *fc)
if (!fc->percpu)
return -ENOMEM;
+ cpu_notifier_register_begin();
+
for_each_online_cpu(i) {
if (flow_cache_cpu_prepare(fc, i))
goto err;
@@ -463,7 +464,9 @@ static int __init flow_cache_init(struct flow_cache *fc)
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
};
- register_hotcpu_notifier(&fc->hotcpu_notifier);
+ __register_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpu_notifier_register_done();
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -479,19 +482,30 @@ err:
fcp->hash_table = NULL;
}
+ cpu_notifier_register_done();
+
free_percpu(fc->percpu);
fc->percpu = NULL;
return -ENOMEM;
}
+EXPORT_SYMBOL(flow_cache_init);
-static int __init flow_cache_init_global(void)
+void flow_cache_fini(struct net *net)
{
- flow_cachep = kmem_cache_create("flow_cache",
- sizeof(struct flow_cache_entry),
- 0, SLAB_PANIC, NULL);
+ int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
- return flow_cache_init(&flow_cache_global);
-}
+ del_timer_sync(&fc->rnd_timer);
+ unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
-module_init(flow_cache_init_global);
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+}
+EXPORT_SYMBOL(flow_cache_fini);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e29e810663d7..5f362c1d0332 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -61,7 +61,7 @@ bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
again:
switch (proto) {
- case __constant_htons(ETH_P_IP): {
+ case htons(ETH_P_IP): {
const struct iphdr *iph;
struct iphdr _iph;
ip:
@@ -77,9 +77,11 @@ ip:
iph_to_flow_copy_addrs(flow, iph);
break;
}
- case __constant_htons(ETH_P_IPV6): {
+ case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
+ __be32 flow_label;
+
ipv6:
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
if (!iph)
@@ -89,10 +91,25 @@ ipv6:
flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
nhoff += sizeof(struct ipv6hdr);
+
+ flow_label = ip6_flowlabel(iph);
+ if (flow_label) {
+ /* Awesome, IPv6 packet has a flow label so we can
+ * use that to represent the ports without any
+ * further dissection.
+ */
+ flow->n_proto = proto;
+ flow->ip_proto = ip_proto;
+ flow->ports = flow_label;
+ flow->thoff = (u16)nhoff;
+
+ return true;
+ }
+
break;
}
- case __constant_htons(ETH_P_8021AD):
- case __constant_htons(ETH_P_8021Q): {
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
@@ -104,7 +121,7 @@ ipv6:
nhoff += sizeof(*vlan);
goto again;
}
- case __constant_htons(ETH_P_PPP_SES): {
+ case htons(ETH_P_PPP_SES): {
struct {
struct pppoe_hdr hdr;
__be16 proto;
@@ -115,9 +132,9 @@ ipv6:
proto = hdr->proto;
nhoff += PPPOE_SES_HLEN;
switch (proto) {
- case __constant_htons(PPP_IP):
+ case htons(PPP_IP):
goto ip;
- case __constant_htons(PPP_IPV6):
+ case htons(PPP_IPV6):
goto ipv6;
default:
return false;
@@ -175,6 +192,7 @@ ipv6:
break;
}
+ flow->n_proto = proto;
flow->ip_proto = ip_proto;
flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
flow->thoff = (u16) nhoff;
@@ -195,44 +213,52 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
return jhash_3words(a, b, c, hashrnd);
}
-static __always_inline u32 __flow_hash_1word(u32 a)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
{
- __flow_hash_secret_init();
- return jhash_1word(a, hashrnd);
+ u32 hash;
+
+ /* get a consistent hash (same value on both flow directions) */
+ if (((__force u32)keys->dst < (__force u32)keys->src) ||
+ (((__force u32)keys->dst == (__force u32)keys->src) &&
+ ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
+ swap(keys->dst, keys->src);
+ swap(keys->port16[0], keys->port16[1]);
+ }
+
+ hash = __flow_hash_3words((__force u32)keys->dst,
+ (__force u32)keys->src,
+ (__force u32)keys->ports);
+ if (!hash)
+ hash = 1;
+
+ return hash;
}
+u32 flow_hash_from_keys(struct flow_keys *keys)
+{
+ return __flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(flow_hash_from_keys);
+
/*
* __skb_get_hash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
- * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
+ * and src/dst port numbers. Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
*/
void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
- u32 hash;
if (!skb_flow_dissect(skb, &keys))
return;
if (keys.ports)
- skb->l4_rxhash = 1;
-
- /* get a consistent hash (same value on both flow directions) */
- if (((__force u32)keys.dst < (__force u32)keys.src) ||
- (((__force u32)keys.dst == (__force u32)keys.src) &&
- ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
- swap(keys.dst, keys.src);
- swap(keys.port16[0], keys.port16[1]);
- }
+ skb->l4_hash = 1;
- hash = __flow_hash_3words((__force u32)keys.dst,
- (__force u32)keys.src,
- (__force u32)keys.ports);
- if (!hash)
- hash = 1;
+ skb->sw_hash = 1;
- skb->rxhash = hash;
+ skb->hash = __flow_hash_from_keys(&keys);
}
EXPORT_SYMBOL(__skb_get_hash);
@@ -240,7 +266,7 @@ EXPORT_SYMBOL(__skb_get_hash);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
unsigned int num_tx_queues)
{
u32 hash;
@@ -260,13 +286,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
qcount = dev->tc_to_txq[tc].count;
}
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol;
- hash = __flow_hash_1word(hash);
-
- return (u16) (((u64) hash * qcount) >> 32) + qoffset;
+ return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
@@ -338,17 +358,10 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
- else {
- u32 hash;
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol ^
- skb->rxhash;
- hash = __flow_hash_1word(hash);
+ else
queue_index = map->queues[
- ((u64)hash * map->len) >> 32];
- }
+ ((u64)skb_get_hash(skb) * map->len) >> 32];
+
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 6b5b6e7013ca..9d33dfffca19 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -197,7 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
* as destination. A new timer with the interval specified in the
* configuration TLV is created. Upon each interval, the latest statistics
* will be read from &bstats and the estimated rate will be stored in
- * &rate_est with the statistics lock grabed during this period.
+ * &rate_est with the statistics lock grabbed during this period.
*
* Returns 0 on success or a negative error code.
*
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 9d3d9e78397b..2ddbce4cce14 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -206,7 +206,7 @@ EXPORT_SYMBOL(gnet_stats_copy_queue);
* @st: application specific statistics data
* @len: length of data
*
- * Appends the application sepecific statistics to the top level TLV created by
+ * Appends the application specific statistics to the top level TLV created by
* gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
* handle is in backward compatibility mode.
*
diff --git a/net/core/iovec.c b/net/core/iovec.c
index b61869429f4c..e1ec45ab1e63 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -39,7 +39,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
{
int size, ct, err;
- if (m->msg_namelen) {
+ if (m->msg_name && m->msg_namelen) {
if (mode == VERIFY_READ) {
void __user *namep;
namep = (void __user __force *) m->msg_name;
@@ -48,10 +48,10 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
if (err < 0)
return err;
}
- if (m->msg_name)
- m->msg_name = address;
+ m->msg_name = address;
} else {
m->msg_name = NULL;
+ m->msg_namelen = 0;
}
size = m->msg_iovlen * sizeof(struct iovec);
@@ -75,61 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
}
/*
- * Copy kernel to iovec. Returns -EFAULT on error.
- */
-
-int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
- int offset, int len)
-{
- int copy;
- for (; len > 0; ++iov) {
- /* Skip over the finished iovecs */
- if (unlikely(offset >= iov->iov_len)) {
- offset -= iov->iov_len;
- continue;
- }
- copy = min_t(unsigned int, iov->iov_len - offset, len);
- if (copy_to_user(iov->iov_base + offset, kdata, copy))
- return -EFAULT;
- offset = 0;
- kdata += copy;
- len -= copy;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovecend);
-
-/*
- * Copy iovec to kernel. Returns -EFAULT on error.
- */
-
-int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
- int offset, int len)
-{
- /* Skip over the finished iovecs */
- while (offset >= iov->iov_len) {
- offset -= iov->iov_len;
- iov++;
- }
-
- while (len > 0) {
- u8 __user *base = iov->iov_base + offset;
- int copy = min_t(unsigned int, len, iov->iov_len - offset);
-
- offset = 0;
- if (copy_from_user(kdata, base, copy))
- return -EFAULT;
- len -= copy;
- kdata += copy;
- iov++;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovecend);
-
-/*
* And now for the all-in-one: copy and checksum from a user iovec
* directly to a datagram
* Calls to csum_partial but the last must be in 32 bit chunks
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 9c3a839322ba..bd0767e6b2b3 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev)
* Make sure the above read is complete since it can be
* rewritten as soon as we clear the bit below.
*/
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
/* We are about to handle this device,
* so new events can be accepted
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e16129019c66..ef31fef25e5a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -836,10 +836,10 @@ out:
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- return (n->nud_state & NUD_PROBE) ?
- NEIGH_VAR(p, UCAST_PROBES) :
- NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
- NEIGH_VAR(p, MCAST_PROBES);
+ int max_probes = NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES);
+ if (!(n->nud_state & NUD_PROBE))
+ max_probes += NEIGH_VAR(p, MCAST_PROBES);
+ return max_probes;
}
static void neigh_invalidate(struct neighbour *neigh)
@@ -945,6 +945,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_FAILED;
notify = 1;
neigh_invalidate(neigh);
+ goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
@@ -1247,8 +1248,8 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->updated = jiffies;
if (!(neigh->nud_state & NUD_FAILED))
return;
- neigh->nud_state = NUD_PROBE;
- atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES));
+ neigh->nud_state = NUD_INCOMPLETE;
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
}
@@ -2248,7 +2249,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = pn->flags | NTF_PROXY;
- ndm->ndm_type = NDA_DST;
+ ndm->ndm_type = RTN_UNICAST;
ndm->ndm_ifindex = pn->dev->ifindex;
ndm->ndm_state = NUD_NONE;
@@ -3058,11 +3059,12 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
+ struct neigh_table *tbl = p->tbl;
dev_name_source = "default";
- t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
- t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
- t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
- t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
}
if (handler) {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 93886246a0b4..9dd06699b09c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -43,12 +43,12 @@ static ssize_t netdev_show(const struct device *dev,
struct device_attribute *attr, char *buf,
ssize_t (*format)(const struct net_device *, char *))
{
- struct net_device *net = to_net_dev(dev);
+ struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
- if (dev_isalive(net))
- ret = (*format)(net, buf);
+ if (dev_isalive(ndev))
+ ret = (*format)(ndev, buf);
read_unlock(&dev_base_lock);
return ret;
@@ -56,9 +56,9 @@ static ssize_t netdev_show(const struct device *dev,
/* generate a show function for simple field */
#define NETDEVICE_SHOW(field, format_string) \
-static ssize_t format_##field(const struct net_device *net, char *buf) \
+static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sprintf(buf, format_string, net->field); \
+ return sprintf(buf, format_string, dev->field); \
} \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -104,6 +104,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
}
NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
NETDEVICE_SHOW_RO(addr_len, fmt_dec);
NETDEVICE_SHOW_RO(iflink, fmt_dec);
@@ -111,16 +112,35 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec);
NETDEVICE_SHOW_RO(type, fmt_dec);
NETDEVICE_SHOW_RO(link_mode, fmt_dec);
+static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
+{
+ return sprintf(buf, fmt_dec, dev->name_assign_type);
+}
+
+static ssize_t name_assign_type_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (ndev->name_assign_type != NET_NAME_UNKNOWN)
+ ret = netdev_show(dev, attr, buf, format_name_assign_type);
+
+ return ret;
+}
+static DEVICE_ATTR_RO(name_assign_type);
+
/* use same locking rules as GIFHWADDR ioctl's */
static ssize_t address_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct net_device *net = to_net_dev(dev);
+ struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
- if (dev_isalive(net))
- ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len);
+ if (dev_isalive(ndev))
+ ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
read_unlock(&dev_base_lock);
return ret;
}
@@ -129,18 +149,18 @@ static DEVICE_ATTR_RO(address);
static ssize_t broadcast_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct net_device *net = to_net_dev(dev);
- if (dev_isalive(net))
- return sysfs_format_mac(buf, net->broadcast, net->addr_len);
+ struct net_device *ndev = to_net_dev(dev);
+ if (dev_isalive(ndev))
+ return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
return -EINVAL;
}
static DEVICE_ATTR_RO(broadcast);
-static int change_carrier(struct net_device *net, unsigned long new_carrier)
+static int change_carrier(struct net_device *dev, unsigned long new_carrier)
{
- if (!netif_running(net))
+ if (!netif_running(dev))
return -EINVAL;
- return dev_change_carrier(net, (bool) new_carrier);
+ return dev_change_carrier(dev, (bool) new_carrier);
}
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
@@ -252,11 +272,21 @@ static ssize_t operstate_show(struct device *dev,
}
static DEVICE_ATTR_RO(operstate);
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ return sprintf(buf, fmt_dec,
+ atomic_read(&netdev->carrier_changes));
+}
+static DEVICE_ATTR_RO(carrier_changes);
+
/* read-write attributes */
-static int change_mtu(struct net_device *net, unsigned long new_mtu)
+static int change_mtu(struct net_device *dev, unsigned long new_mtu)
{
- return dev_set_mtu(net, (int) new_mtu);
+ return dev_set_mtu(dev, (int) new_mtu);
}
static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
@@ -266,9 +296,9 @@ static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
}
NETDEVICE_SHOW_RW(mtu, fmt_dec);
-static int change_flags(struct net_device *net, unsigned long new_flags)
+static int change_flags(struct net_device *dev, unsigned long new_flags)
{
- return dev_change_flags(net, (unsigned int) new_flags);
+ return dev_change_flags(dev, (unsigned int) new_flags);
}
static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
@@ -278,9 +308,9 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
}
NETDEVICE_SHOW_RW(flags, fmt_hex);
-static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
+static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- net->tx_queue_len = new_len;
+ dev->tx_queue_len = new_len;
return 0;
}
@@ -333,9 +363,9 @@ static ssize_t ifalias_show(struct device *dev,
}
static DEVICE_ATTR_RW(ifalias);
-static int change_group(struct net_device *net, unsigned long new_group)
+static int change_group(struct net_device *dev, unsigned long new_group)
{
- dev_set_group(net, (int) new_group);
+ dev_set_group(dev, (int) new_group);
return 0;
}
@@ -373,8 +403,10 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
&dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
&dev_attr_iflink.attr,
&dev_attr_ifindex.attr,
+ &dev_attr_name_assign_type.attr,
&dev_attr_addr_assign_type.attr,
&dev_attr_addr_len.attr,
&dev_attr_link_mode.attr,
@@ -384,6 +416,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_duplex.attr,
&dev_attr_dormant.attr,
&dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
&dev_attr_ifalias.attr,
&dev_attr_carrier.attr,
&dev_attr_mtu.attr,
@@ -763,20 +796,20 @@ static struct kobj_type rx_queue_ktype = {
.namespace = rx_queue_namespace
};
-static int rx_queue_add_kobject(struct net_device *net, int index)
+static int rx_queue_add_kobject(struct net_device *dev, int index)
{
- struct netdev_rx_queue *queue = net->_rx + index;
+ struct netdev_rx_queue *queue = dev->_rx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
- kobj->kset = net->queues_kset;
+ kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
if (error)
goto exit;
- if (net->sysfs_rx_queue_group) {
- error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+ if (dev->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
if (error)
goto exit;
}
@@ -789,21 +822,21 @@ exit:
kobject_put(kobj);
return error;
}
-#endif /* CONFIG_SYFS */
+#endif /* CONFIG_SYSFS */
int
-net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
{
#ifdef CONFIG_SYSFS
int i;
int error = 0;
#ifndef CONFIG_RPS
- if (!net->sysfs_rx_queue_group)
+ if (!dev->sysfs_rx_queue_group)
return 0;
#endif
for (i = old_num; i < new_num; i++) {
- error = rx_queue_add_kobject(net, i);
+ error = rx_queue_add_kobject(dev, i);
if (error) {
new_num = old_num;
break;
@@ -811,10 +844,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
while (--i >= new_num) {
- if (net->sysfs_rx_queue_group)
- sysfs_remove_group(&net->_rx[i].kobj,
- net->sysfs_rx_queue_group);
- kobject_put(&net->_rx[i].kobj);
+ if (dev->sysfs_rx_queue_group)
+ sysfs_remove_group(&dev->_rx[i].kobj,
+ dev->sysfs_rx_queue_group);
+ kobject_put(&dev->_rx[i].kobj);
}
return error;
@@ -996,15 +1029,12 @@ static struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
- int i;
-
- for (i = 0; i < dev->num_tx_queues; i++)
- if (queue == &dev->_tx[i])
- break;
+ unsigned int i;
+ i = queue - dev->_tx;
BUG_ON(i >= dev->num_tx_queues);
return i;
@@ -1125,13 +1155,13 @@ static struct kobj_type netdev_queue_ktype = {
.namespace = netdev_queue_namespace,
};
-static int netdev_queue_add_kobject(struct net_device *net, int index)
+static int netdev_queue_add_kobject(struct net_device *dev, int index)
{
- struct netdev_queue *queue = net->_tx + index;
+ struct netdev_queue *queue = dev->_tx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
- kobj->kset = net->queues_kset;
+ kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
if (error)
@@ -1154,14 +1184,14 @@ exit:
#endif /* CONFIG_SYSFS */
int
-netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
{
#ifdef CONFIG_SYSFS
int i;
int error = 0;
for (i = old_num; i < new_num; i++) {
- error = netdev_queue_add_kobject(net, i);
+ error = netdev_queue_add_kobject(dev, i);
if (error) {
new_num = old_num;
break;
@@ -1169,7 +1199,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
while (--i >= new_num) {
- struct netdev_queue *queue = net->_tx + i;
+ struct netdev_queue *queue = dev->_tx + i;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1183,25 +1213,25 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
#endif /* CONFIG_SYSFS */
}
-static int register_queue_kobjects(struct net_device *net)
+static int register_queue_kobjects(struct net_device *dev)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
#ifdef CONFIG_SYSFS
- net->queues_kset = kset_create_and_add("queues",
- NULL, &net->dev.kobj);
- if (!net->queues_kset)
+ dev->queues_kset = kset_create_and_add("queues",
+ NULL, &dev->dev.kobj);
+ if (!dev->queues_kset)
return -ENOMEM;
- real_rx = net->real_num_rx_queues;
+ real_rx = dev->real_num_rx_queues;
#endif
- real_tx = net->real_num_tx_queues;
+ real_tx = dev->real_num_tx_queues;
- error = net_rx_queue_update_kobjects(net, 0, real_rx);
+ error = net_rx_queue_update_kobjects(dev, 0, real_rx);
if (error)
goto error;
rxq = real_rx;
- error = netdev_queue_update_kobjects(net, 0, real_tx);
+ error = netdev_queue_update_kobjects(dev, 0, real_tx);
if (error)
goto error;
txq = real_tx;
@@ -1209,24 +1239,24 @@ static int register_queue_kobjects(struct net_device *net)
return 0;
error:
- netdev_queue_update_kobjects(net, txq, 0);
- net_rx_queue_update_kobjects(net, rxq, 0);
+ netdev_queue_update_kobjects(dev, txq, 0);
+ net_rx_queue_update_kobjects(dev, rxq, 0);
return error;
}
-static void remove_queue_kobjects(struct net_device *net)
+static void remove_queue_kobjects(struct net_device *dev)
{
int real_rx = 0, real_tx = 0;
#ifdef CONFIG_SYSFS
- real_rx = net->real_num_rx_queues;
+ real_rx = dev->real_num_rx_queues;
#endif
- real_tx = net->real_num_tx_queues;
+ real_tx = dev->real_num_tx_queues;
- net_rx_queue_update_kobjects(net, real_rx, 0);
- netdev_queue_update_kobjects(net, real_tx, 0);
+ net_rx_queue_update_kobjects(dev, real_rx, 0);
+ netdev_queue_update_kobjects(dev, real_tx, 0);
#ifdef CONFIG_SYSFS
- kset_unregister(net->queues_kset);
+ kset_unregister(dev->queues_kset);
#endif
}
@@ -1319,13 +1349,13 @@ static struct class net_class = {
/* Delete sysfs entries but hold kobject reference until after all
* netdev references are gone.
*/
-void netdev_unregister_kobject(struct net_device * net)
+void netdev_unregister_kobject(struct net_device *ndev)
{
- struct device *dev = &(net->dev);
+ struct device *dev = &(ndev->dev);
kobject_get(&dev->kobj);
- remove_queue_kobjects(net);
+ remove_queue_kobjects(ndev);
pm_runtime_set_memalloc_noio(dev, false);
@@ -1333,18 +1363,18 @@ void netdev_unregister_kobject(struct net_device * net)
}
/* Create sysfs entries for network device. */
-int netdev_register_kobject(struct net_device *net)
+int netdev_register_kobject(struct net_device *ndev)
{
- struct device *dev = &(net->dev);
- const struct attribute_group **groups = net->sysfs_groups;
+ struct device *dev = &(ndev->dev);
+ const struct attribute_group **groups = ndev->sysfs_groups;
int error = 0;
device_initialize(dev);
dev->class = &net_class;
- dev->platform_data = net;
+ dev->platform_data = ndev;
dev->groups = groups;
- dev_set_name(dev, "%s", net->name);
+ dev_set_name(dev, "%s", ndev->name);
#ifdef CONFIG_SYSFS
/* Allow for a device specific group */
@@ -1354,10 +1384,10 @@ int netdev_register_kobject(struct net_device *net)
*groups++ = &netstat_group;
#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
- if (net->ieee80211_ptr)
+ if (ndev->ieee80211_ptr)
*groups++ = &wireless_group;
#if IS_ENABLED(CONFIG_WIRELESS_EXT)
- else if (net->wireless_handlers)
+ else if (ndev->wireless_handlers)
*groups++ = &wireless_group;
#endif
#endif
@@ -1367,7 +1397,7 @@ int netdev_register_kobject(struct net_device *net)
if (error)
return error;
- error = register_queue_kobjects(net);
+ error = register_queue_kobjects(ndev);
if (error) {
device_del(dev);
return error;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 81d3a9a08453..7c6b51a58968 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -24,7 +24,7 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
-static DEFINE_MUTEX(net_mutex);
+DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -273,7 +273,7 @@ static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp;
- LIST_HEAD(net_kill_list);
+ struct list_head net_kill_list;
LIST_HEAD(net_exit_list);
/* Atomically snapshot the list of namespaces to cleanup */
@@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid)
tsk = find_task_by_vpid(pid);
if (tsk) {
struct nsproxy *nsproxy;
- nsproxy = task_nsproxy(tsk);
+ task_lock(tsk);
+ nsproxy = tsk->nsproxy;
if (nsproxy)
net = get_net(nsproxy->net_ns);
+ task_unlock(tsk);
}
rcu_read_unlock();
return net;
@@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task)
struct net *net = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy)
net = get_net(nsproxy->net_ns);
- rcu_read_unlock();
+ task_unlock(task);
return net;
}
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd541668..1f2a126f4ffa 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
- return css_cls_state(task_css(p, net_cls_subsys_id));
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
}
EXPORT_SYMBOL_GPL(task_cls_state);
@@ -42,7 +42,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
struct cgroup_cls_state *cs = css_cls_state(css);
- struct cgroup_cls_state *parent = css_cls_state(css_parent(css));
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
if (parent)
cs->classid = parent->classid;
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
void *v = (void *)(unsigned long)cs->classid;
struct task_struct *p;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_classid, v);
task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
+struct cgroup_subsys net_cls_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = cgrp_attach,
- .subsys_id = net_cls_subsys_id,
- .base_cftypes = ss_files,
- .module = THIS_MODULE,
+ .legacy_cftypes = ss_files,
};
-
-static int __init init_netclassid_cgroup(void)
-{
- return cgroup_load_subsys(&net_cls_subsys);
-}
-__initcall(init_netclassid_cgroup);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a664f7829a6d..907fb5e36c02 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -46,13 +46,9 @@
static struct sk_buff_head skb_pool;
-static atomic_t trapped;
-
DEFINE_STATIC_SRCU(netpoll_srcu);
#define USEC_PER_POLL 50
-#define NETPOLL_RX_ENABLED 1
-#define NETPOLL_RX_DROP 2
#define MAX_SKB_SIZE \
(sizeof(struct ethhdr) + \
@@ -61,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
MAX_UDP_CHUNK)
static void zap_completion_queue(void);
-static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo);
static void netpoll_async_cleanup(struct work_struct *work);
static unsigned int carrier_timeout = 4;
@@ -74,6 +69,37 @@ module_param(carrier_timeout, uint, 0644);
#define np_notice(np, fmt, ...) \
pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
+static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (vlan_tx_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_put_tag(skb, skb->vlan_proto,
+ vlan_tx_tag_get(skb));
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ skb->vlan_tci = 0;
+ }
+
+ status = ops->ndo_start_xmit(skb, dev);
+ if (status == NETDEV_TX_OK)
+ txq_trans_update(txq);
+
+out:
+ return status;
+}
+
static void queue_process(struct work_struct *work)
{
struct netpoll_info *npinfo =
@@ -83,51 +109,31 @@ static void queue_process(struct work_struct *work)
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
- const struct net_device_ops *ops = dev->netdev_ops;
struct netdev_queue *txq;
if (!netif_device_present(dev) || !netif_running(dev)) {
- __kfree_skb(skb);
+ kfree_skb(skb);
continue;
}
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
local_irq_save(flags);
- __netif_tx_lock(txq, smp_processor_id());
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
- ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
- __netif_tx_unlock(txq);
+ HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
schedule_delayed_work(&npinfo->tx_work, HZ/10);
return;
}
- __netif_tx_unlock(txq);
+ HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
}
}
-static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
- unsigned short ulen, __be32 saddr, __be32 daddr)
-{
- __wsum psum;
-
- if (uh->check == 0 || skb_csum_unnecessary(skb))
- return 0;
-
- psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
-
- if (skb->ip_summed == CHECKSUM_COMPLETE &&
- !csum_fold(csum_add(psum, skb->csum)))
- return 0;
-
- skb->csum = psum;
-
- return __skb_checksum_complete(skb);
-}
-
/*
* Check whether delayed processing was scheduled for our NIC. If so,
* we attempt to grab the poll lock and use ->poll() to pump the card.
@@ -138,14 +144,8 @@ static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
* trylock here and interrupts are already disabled in the softirq
* case. Further, we test the poll_owner to avoid recursion on UP
* systems where the lock doesn't exist.
- *
- * In cases where there is bi-directional communications, reading only
- * one message at a time can lead to packets being dropped by the
- * network adapter, forcing superfluous retries and possibly timeouts.
- * Thus, we set our budget to greater than 1.
*/
-static int poll_one_napi(struct netpoll_info *npinfo,
- struct napi_struct *napi, int budget)
+static int poll_one_napi(struct napi_struct *napi, int budget)
{
int work;
@@ -156,52 +156,35 @@ static int poll_one_napi(struct netpoll_info *npinfo,
if (!test_bit(NAPI_STATE_SCHED, &napi->state))
return budget;
- npinfo->rx_flags |= NETPOLL_RX_DROP;
- atomic_inc(&trapped);
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
- atomic_dec(&trapped);
- npinfo->rx_flags &= ~NETPOLL_RX_DROP;
return budget - work;
}
-static void poll_napi(struct net_device *dev)
+static void poll_napi(struct net_device *dev, int budget)
{
struct napi_struct *napi;
- int budget = 16;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner != smp_processor_id() &&
spin_trylock(&napi->poll_lock)) {
- budget = poll_one_napi(rcu_dereference_bh(dev->npinfo),
- napi, budget);
+ budget = poll_one_napi(napi, budget);
spin_unlock(&napi->poll_lock);
-
- if (!budget)
- break;
}
}
}
-static void service_neigh_queue(struct netpoll_info *npi)
-{
- if (npi) {
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&npi->neigh_tx)))
- netpoll_neigh_reply(skb, npi);
- }
-}
-
static void netpoll_poll_dev(struct net_device *dev)
{
const struct net_device_ops *ops;
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ int budget = 0;
/* Don't do any rx activity if the dev_lock mutex is held
* the dev_open/close paths use this to block netpoll activity
@@ -224,31 +207,14 @@ static void netpoll_poll_dev(struct net_device *dev)
/* Process pending work on NIC */
ops->ndo_poll_controller(dev);
- poll_napi(dev);
+ poll_napi(dev, budget);
up(&ni->dev_lock);
- if (dev->flags & IFF_SLAVE) {
- if (ni) {
- struct net_device *bond_dev;
- struct sk_buff *skb;
- struct netpoll_info *bond_ni;
-
- bond_dev = netdev_master_upper_dev_get_rcu(dev);
- bond_ni = rcu_dereference_bh(bond_dev->npinfo);
- while ((skb = skb_dequeue(&ni->neigh_tx))) {
- skb->dev = bond_dev;
- skb_queue_tail(&bond_ni->neigh_tx, skb);
- }
- }
- }
-
- service_neigh_queue(ni);
-
zap_completion_queue();
}
-void netpoll_rx_disable(struct net_device *dev)
+void netpoll_poll_disable(struct net_device *dev)
{
struct netpoll_info *ni;
int idx;
@@ -259,9 +225,9 @@ void netpoll_rx_disable(struct net_device *dev)
down(&ni->dev_lock);
srcu_read_unlock(&netpoll_srcu, idx);
}
-EXPORT_SYMBOL(netpoll_rx_disable);
+EXPORT_SYMBOL(netpoll_poll_disable);
-void netpoll_rx_enable(struct net_device *dev)
+void netpoll_poll_enable(struct net_device *dev)
{
struct netpoll_info *ni;
rcu_read_lock();
@@ -270,7 +236,7 @@ void netpoll_rx_enable(struct net_device *dev)
up(&ni->dev_lock);
rcu_read_unlock();
}
-EXPORT_SYMBOL(netpoll_rx_enable);
+EXPORT_SYMBOL(netpoll_poll_enable);
static void refill_skbs(void)
{
@@ -304,7 +270,7 @@ static void zap_completion_queue(void)
while (clist != NULL) {
struct sk_buff *skb = clist;
clist = clist->next;
- if (skb->destructor) {
+ if (!skb_irq_freeable(skb)) {
atomic_inc(&skb->users);
dev_kfree_skb_any(skb); /* put this one back */
} else {
@@ -359,7 +325,6 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
{
int status = NETDEV_TX_BUSY;
unsigned long tries;
- const struct net_device_ops *ops = dev->netdev_ops;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo;
@@ -367,7 +332,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
npinfo = rcu_dereference_bh(np->dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
- __kfree_skb(skb);
+ dev_kfree_skb_irq(skb);
return;
}
@@ -380,29 +345,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
- if (__netif_tx_trylock(txq)) {
- if (!netif_xmit_stopped(txq)) {
- if (vlan_tx_tag_present(skb) &&
- !vlan_hw_offload_capable(netif_skb_features(skb),
- skb->vlan_proto)) {
- skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
- if (unlikely(!skb)) {
- /* This is actually a packet drop, but we
- * don't want the code at the end of this
- * function to try and re-queue a NULL skb.
- */
- status = NETDEV_TX_OK;
- goto unlock_txq;
- }
- skb->vlan_tci = 0;
- }
-
- status = ops->ndo_start_xmit(skb, dev);
- if (status == NETDEV_TX_OK)
- txq_trans_update(txq);
- }
- unlock_txq:
- __netif_tx_unlock(txq);
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
+
+ HARD_TX_UNLOCK(dev, txq);
if (status == NETDEV_TX_OK)
break;
@@ -417,7 +364,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
WARN_ONCE(!irqs_disabled(),
"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
- dev->name, ops->ndo_start_xmit);
+ dev->name, dev->netdev_ops->ndo_start_xmit);
}
@@ -529,384 +476,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
}
EXPORT_SYMBOL(netpoll_send_udp);
-static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo)
-{
- int size, type = ARPOP_REPLY;
- __be32 sip, tip;
- unsigned char *sha;
- struct sk_buff *send_skb;
- struct netpoll *np, *tmp;
- unsigned long flags;
- int hlen, tlen;
- int hits = 0, proto;
-
- if (list_empty(&npinfo->rx_np))
- return;
-
- /* Before checking the packet, we do some early
- inspection whether this is interesting at all */
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (np->dev == skb->dev)
- hits++;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
- /* No netpoll struct is using this dev */
- if (!hits)
- return;
-
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto == ETH_P_ARP) {
- struct arphdr *arp;
- unsigned char *arp_ptr;
- /* No arp on this interface */
- if (skb->dev->flags & IFF_NOARP)
- return;
-
- if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
- return;
-
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- arp = arp_hdr(skb);
-
- if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
- arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
- arp->ar_pro != htons(ETH_P_IP) ||
- arp->ar_op != htons(ARPOP_REQUEST))
- return;
-
- arp_ptr = (unsigned char *)(arp+1);
- /* save the location of the src hw addr */
- sha = arp_ptr;
- arp_ptr += skb->dev->addr_len;
- memcpy(&sip, arp_ptr, 4);
- arp_ptr += 4;
- /* If we actually cared about dst hw addr,
- it would get copied here */
- arp_ptr += skb->dev->addr_len;
- memcpy(&tip, arp_ptr, 4);
-
- /* Should we ignore arp? */
- if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
- return;
-
- size = arp_hdr_len(skb->dev);
-
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (tip != np->local_ip.ip)
- continue;
-
- hlen = LL_RESERVED_SPACE(np->dev);
- tlen = np->dev->needed_tailroom;
- send_skb = find_skb(np, size + hlen + tlen, hlen);
- if (!send_skb)
- continue;
-
- skb_reset_network_header(send_skb);
- arp = (struct arphdr *) skb_put(send_skb, size);
- send_skb->dev = skb->dev;
- send_skb->protocol = htons(ETH_P_ARP);
-
- /* Fill the device header for the ARP frame */
- if (dev_hard_header(send_skb, skb->dev, ETH_P_ARP,
- sha, np->dev->dev_addr,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- continue;
- }
-
- /*
- * Fill out the arp protocol part.
- *
- * we only support ethernet device type,
- * which (according to RFC 1390) should
- * always equal 1 (Ethernet).
- */
-
- arp->ar_hrd = htons(np->dev->type);
- arp->ar_pro = htons(ETH_P_IP);
- arp->ar_hln = np->dev->addr_len;
- arp->ar_pln = 4;
- arp->ar_op = htons(type);
-
- arp_ptr = (unsigned char *)(arp + 1);
- memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &tip, 4);
- arp_ptr += 4;
- memcpy(arp_ptr, sha, np->dev->addr_len);
- arp_ptr += np->dev->addr_len;
- memcpy(arp_ptr, &sip, 4);
-
- netpoll_send_skb(np, send_skb);
-
- /* If there are several rx_skb_hooks for the same
- * address we're fine by sending a single reply
- */
- break;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- } else if( proto == ETH_P_IPV6) {
-#if IS_ENABLED(CONFIG_IPV6)
- struct nd_msg *msg;
- u8 *lladdr = NULL;
- struct ipv6hdr *hdr;
- struct icmp6hdr *icmp6h;
- const struct in6_addr *saddr;
- const struct in6_addr *daddr;
- struct inet6_dev *in6_dev = NULL;
- struct in6_addr *target;
-
- in6_dev = in6_dev_get(skb->dev);
- if (!in6_dev || !in6_dev->cnf.accept_ra)
- return;
-
- if (!pskb_may_pull(skb, skb->len))
- return;
-
- msg = (struct nd_msg *)skb_transport_header(skb);
-
- __skb_push(skb, skb->data - skb_transport_header(skb));
-
- if (ipv6_hdr(skb)->hop_limit != 255)
- return;
- if (msg->icmph.icmp6_code != 0)
- return;
- if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
- return;
-
- saddr = &ipv6_hdr(skb)->saddr;
- daddr = &ipv6_hdr(skb)->daddr;
-
- size = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
-
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (!ipv6_addr_equal(daddr, &np->local_ip.in6))
- continue;
-
- hlen = LL_RESERVED_SPACE(np->dev);
- tlen = np->dev->needed_tailroom;
- send_skb = find_skb(np, size + hlen + tlen, hlen);
- if (!send_skb)
- continue;
-
- send_skb->protocol = htons(ETH_P_IPV6);
- send_skb->dev = skb->dev;
-
- skb_reset_network_header(send_skb);
- hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr));
- *(__be32*)hdr = htonl(0x60000000);
- hdr->payload_len = htons(size);
- hdr->nexthdr = IPPROTO_ICMPV6;
- hdr->hop_limit = 255;
- hdr->saddr = *saddr;
- hdr->daddr = *daddr;
-
- icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr));
- icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
- icmp6h->icmp6_router = 0;
- icmp6h->icmp6_solicited = 1;
-
- target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr));
- *target = msg->target;
- icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size,
- IPPROTO_ICMPV6,
- csum_partial(icmp6h,
- size, 0));
-
- if (dev_hard_header(send_skb, skb->dev, ETH_P_IPV6,
- lladdr, np->dev->dev_addr,
- send_skb->len) < 0) {
- kfree_skb(send_skb);
- continue;
- }
-
- netpoll_send_skb(np, send_skb);
-
- /* If there are several rx_skb_hooks for the same
- * address, we're fine by sending a single reply
- */
- break;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-#endif
- }
-}
-
-static bool pkt_is_ns(struct sk_buff *skb)
-{
- struct nd_msg *msg;
- struct ipv6hdr *hdr;
-
- if (skb->protocol != htons(ETH_P_ARP))
- return false;
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
- return false;
-
- msg = (struct nd_msg *)skb_transport_header(skb);
- __skb_push(skb, skb->data - skb_transport_header(skb));
- hdr = ipv6_hdr(skb);
-
- if (hdr->nexthdr != IPPROTO_ICMPV6)
- return false;
- if (hdr->hop_limit != 255)
- return false;
- if (msg->icmph.icmp6_code != 0)
- return false;
- if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
- return false;
-
- return true;
-}
-
-int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
-{
- int proto, len, ulen, data_len;
- int hits = 0, offset;
- const struct iphdr *iph;
- struct udphdr *uh;
- struct netpoll *np, *tmp;
- uint16_t source;
-
- if (list_empty(&npinfo->rx_np))
- goto out;
-
- if (skb->dev->type != ARPHRD_ETHER)
- goto out;
-
- /* check if netpoll clients need ARP */
- if (skb->protocol == htons(ETH_P_ARP) && atomic_read(&trapped)) {
- skb_queue_tail(&npinfo->neigh_tx, skb);
- return 1;
- } else if (pkt_is_ns(skb) && atomic_read(&trapped)) {
- skb_queue_tail(&npinfo->neigh_tx, skb);
- return 1;
- }
-
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
- skb = vlan_untag(skb);
- if (unlikely(!skb))
- goto out;
- }
-
- proto = ntohs(eth_hdr(skb)->h_proto);
- if (proto != ETH_P_IP && proto != ETH_P_IPV6)
- goto out;
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto out;
- if (skb_shared(skb))
- goto out;
-
- if (proto == ETH_P_IP) {
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
- iph = (struct iphdr *)skb->data;
- if (iph->ihl < 5 || iph->version != 4)
- goto out;
- if (!pskb_may_pull(skb, iph->ihl*4))
- goto out;
- iph = (struct iphdr *)skb->data;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto out;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < iph->ihl*4)
- goto out;
-
- /*
- * Our transport medium may have padded the buffer out.
- * Now We trim to the true length of the frame.
- */
- if (pskb_trim_rcsum(skb, len))
- goto out;
-
- iph = (struct iphdr *)skb->data;
- if (iph->protocol != IPPROTO_UDP)
- goto out;
-
- len -= iph->ihl*4;
- uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
- offset = (unsigned char *)(uh + 1) - skb->data;
- ulen = ntohs(uh->len);
- data_len = skb->len - offset;
- source = ntohs(uh->source);
-
- if (ulen != len)
- goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
- goto out;
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (np->local_ip.ip && np->local_ip.ip != iph->daddr)
- continue;
- if (np->remote_ip.ip && np->remote_ip.ip != iph->saddr)
- continue;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- continue;
-
- np->rx_skb_hook(np, source, skb, offset, data_len);
- hits++;
- }
- } else {
-#if IS_ENABLED(CONFIG_IPV6)
- const struct ipv6hdr *ip6h;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- goto out;
- ip6h = (struct ipv6hdr *)skb->data;
- if (ip6h->version != 6)
- goto out;
- len = ntohs(ip6h->payload_len);
- if (!len)
- goto out;
- if (len + sizeof(struct ipv6hdr) > skb->len)
- goto out;
- if (pskb_trim_rcsum(skb, len + sizeof(struct ipv6hdr)))
- goto out;
- ip6h = ipv6_hdr(skb);
- if (!pskb_may_pull(skb, sizeof(struct udphdr)))
- goto out;
- uh = udp_hdr(skb);
- offset = (unsigned char *)(uh + 1) - skb->data;
- ulen = ntohs(uh->len);
- data_len = skb->len - offset;
- source = ntohs(uh->source);
- if (ulen != skb->len)
- goto out;
- if (udp6_csum_init(skb, uh, IPPROTO_UDP))
- goto out;
- list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
- if (!ipv6_addr_equal(&np->local_ip.in6, &ip6h->daddr))
- continue;
- if (!ipv6_addr_equal(&np->remote_ip.in6, &ip6h->saddr))
- continue;
- if (np->local_port && np->local_port != ntohs(uh->dest))
- continue;
-
- np->rx_skb_hook(np, source, skb, offset, data_len);
- hits++;
- }
-#endif
- }
-
- if (!hits)
- goto out;
-
- kfree_skb(skb);
- return 1;
-
-out:
- if (atomic_read(&trapped)) {
- kfree_skb(skb);
- return 1;
- }
-
- return 0;
-}
-
void netpoll_print_options(struct netpoll *np)
{
np_info(np, "local port %d\n", np->local_port);
@@ -1026,11 +595,10 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
}
EXPORT_SYMBOL(netpoll_parse_options);
-int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
struct netpoll_info *npinfo;
const struct net_device_ops *ops;
- unsigned long flags;
int err;
np->dev = ndev;
@@ -1046,18 +614,13 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
}
if (!ndev->npinfo) {
- npinfo = kmalloc(sizeof(*npinfo), gfp);
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
goto out;
}
- npinfo->rx_flags = 0;
- INIT_LIST_HEAD(&npinfo->rx_np);
-
- spin_lock_init(&npinfo->rx_lock);
sema_init(&npinfo->dev_lock, 1);
- skb_queue_head_init(&npinfo->neigh_tx);
skb_queue_head_init(&npinfo->txq);
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
@@ -1065,7 +628,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_setup) {
- err = ops->ndo_netpoll_setup(ndev, npinfo, gfp);
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
if (err)
goto free_npinfo;
}
@@ -1076,13 +639,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
npinfo->netpoll = np;
- if (np->rx_skb_hook) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_flags |= NETPOLL_RX_ENABLED;
- list_add_tail(&np->rx, &npinfo->rx_np);
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
-
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -1204,7 +760,7 @@ int netpoll_setup(struct netpoll *np)
/* fill up the skb queue */
refill_skbs();
- err = __netpoll_setup(np, ndev, GFP_KERNEL);
+ err = __netpoll_setup(np, ndev);
if (err)
goto put;
@@ -1231,7 +787,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
struct netpoll_info *npinfo =
container_of(rcu_head, struct netpoll_info, rcu);
- skb_queue_purge(&npinfo->neigh_tx);
skb_queue_purge(&npinfo->txq);
/* we can't call cancel_delayed_work_sync here, as we are in softirq */
@@ -1247,7 +802,6 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- unsigned long flags;
/* rtnl_dereference would be preferable here but
* rcu_cleanup_netpoll path can put us in here safely without
@@ -1257,14 +811,6 @@ void __netpoll_cleanup(struct netpoll *np)
if (!npinfo)
return;
- if (!list_empty(&npinfo->rx_np)) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_del(&np->rx);
- if (list_empty(&npinfo->rx_np))
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
-
synchronize_srcu(&netpoll_srcu);
if (atomic_dec_and_test(&npinfo->refcnt)) {
@@ -1274,9 +820,10 @@ void __netpoll_cleanup(struct netpoll *np)
if (ops->ndo_netpoll_cleanup)
ops->ndo_netpoll_cleanup(np->dev);
- rcu_assign_pointer(np->dev->npinfo, NULL);
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
- }
+ } else
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
@@ -1308,18 +855,3 @@ out:
rtnl_unlock();
}
EXPORT_SYMBOL(netpoll_cleanup);
-
-int netpoll_trap(void)
-{
- return atomic_read(&trapped);
-}
-EXPORT_SYMBOL(netpoll_trap);
-
-void netpoll_set_trap(int trap)
-{
- if (trap)
- atomic_inc(&trapped);
- else
- atomic_dec(&trapped);
-}
-EXPORT_SYMBOL(netpoll_set_trap);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd08..cbd0a199bf52 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
- struct cgroup_subsys_state *parent_css = css_parent(css);
+ struct cgroup_subsys_state *parent_css = css->parent;
struct net_device *dev;
int ret = 0;
@@ -185,15 +185,15 @@ static int read_priomap(struct seq_file *sf, void *v)
return 0;
}
-static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
char devname[IFNAMSIZ + 1];
struct net_device *dev;
u32 prio;
int ret;
- if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
return -EINVAL;
dev = dev_get_by_name(&init_net, devname);
@@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
rtnl_lock();
- ret = netprio_set_prio(css, dev, prio);
+ ret = netprio_set_prio(of_css(of), dev, prio);
rtnl_unlock();
dev_put(dev);
- return ret;
+ return ret ?: nbytes;
}
static int update_netprio(const void *v, struct file *file, unsigned n)
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
struct task_struct *p;
void *v = (void *)(unsigned long)css->cgroup->id;
- cgroup_taskset_for_each(p, css, tset) {
+ cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
@@ -239,20 +239,17 @@ static struct cftype ss_files[] = {
{
.name = "ifpriomap",
.seq_show = read_priomap,
- .write_string = write_priomap,
+ .write = write_priomap,
},
{ } /* terminate */
};
-struct cgroup_subsys net_prio_subsys = {
- .name = "net_prio",
+struct cgroup_subsys net_prio_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = net_prio_attach,
- .subsys_id = net_prio_subsys_id,
- .base_cftypes = ss_files,
- .module = THIS_MODULE,
+ .legacy_cftypes = ss_files,
};
static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
static int __init init_cgroup_netprio(void)
{
- int ret;
-
- ret = cgroup_load_subsys(&net_prio_subsys);
- if (ret)
- goto out;
-
register_netdevice_notifier(&netprio_device_notifier);
-
-out:
- return ret;
-}
-
-static void __exit exit_cgroup_netprio(void)
-{
- struct netprio_map *old;
- struct net_device *dev;
-
- unregister_netdevice_notifier(&netprio_device_notifier);
-
- cgroup_unload_subsys(&net_prio_subsys);
-
- rtnl_lock();
- for_each_netdev(&init_net, dev) {
- old = rtnl_dereference(dev->priomap);
- RCU_INIT_POINTER(dev->priomap, NULL);
- if (old)
- kfree_rcu(old, rcu);
- }
- rtnl_unlock();
+ return 0;
}
-module_init(init_cgroup_netprio);
-module_exit(exit_cgroup_netprio);
+subsys_initcall(init_cgroup_netprio);
MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index fdac61cac1bd..8b849ddfef2e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -69,8 +69,9 @@
* for running devices in the if_list and sends packets until count is 0 it
* also the thread checks the thread->control which is used for inter-process
* communication. controlling process "posts" operations to the threads this
- * way. The if_lock should be possible to remove when add/rem_device is merged
- * into this too.
+ * way.
+ * The if_list is RCU protected, and the if_lock remains to protect updating
+ * of if_list, from "add_device" as it invoked from userspace (via proc write).
*
* By design there should only be *one* "controlling" process. In practice
* multiple write accesses gives unpredictable result. Understood by "write"
@@ -208,7 +209,7 @@
#define T_REMDEVALL (1<<2) /* Remove all devs */
#define T_REMDEV (1<<3) /* Remove one dev */
-/* If lock -- can be removed after some work */
+/* If lock -- protects updating of if_list */
#define if_lock(t) spin_lock(&(t->if_lock));
#define if_unlock(t) spin_unlock(&(t->if_lock));
@@ -241,6 +242,7 @@ struct pktgen_dev {
struct proc_dir_entry *entry; /* proc file */
struct pktgen_thread *pg_thread;/* the owner */
struct list_head list; /* chaining in the thread's run-queue */
+ struct rcu_head rcu; /* freed by RCU */
int running; /* if false, the test will stop */
@@ -476,23 +478,22 @@ static int pgctrl_show(struct seq_file *seq, void *v)
static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- int err = 0;
char data[128];
struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
- if (!capable(CAP_NET_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (count == 0)
+ return -EINVAL;
if (count > sizeof(data))
count = sizeof(data);
- if (copy_from_user(data, buf, count)) {
- err = -EFAULT;
- goto out;
- }
- data[count - 1] = 0; /* Make string */
+ if (copy_from_user(data, buf, count))
+ return -EFAULT;
+
+ data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
if (!strcmp(data, "stop"))
pktgen_stop_all_threads_ifs(pn);
@@ -506,10 +507,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
else
pr_warning("Unknown command: %s\n", data);
- err = count;
-
-out:
- return err;
+ return count;
}
static int pgctrl_open(struct inode *inode, struct file *file)
@@ -577,7 +575,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
is_zero_ether_addr(pkt_dev->src_mac) ?
pkt_dev->odev->dev_addr : pkt_dev->src_mac);
- seq_printf(seq, "dst_mac: ");
+ seq_puts(seq, "dst_mac: ");
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
@@ -592,7 +590,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->nr_labels) {
unsigned int i;
- seq_printf(seq, " mpls: ");
+ seq_puts(seq, " mpls: ");
for (i = 0; i < pkt_dev->nr_labels; i++)
seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
i == pkt_dev->nr_labels-1 ? "\n" : ", ");
@@ -617,67 +615,67 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->node >= 0)
seq_printf(seq, " node: %d\n", pkt_dev->node);
- seq_printf(seq, " Flags: ");
+ seq_puts(seq, " Flags: ");
if (pkt_dev->flags & F_IPV6)
- seq_printf(seq, "IPV6 ");
+ seq_puts(seq, "IPV6 ");
if (pkt_dev->flags & F_IPSRC_RND)
- seq_printf(seq, "IPSRC_RND ");
+ seq_puts(seq, "IPSRC_RND ");
if (pkt_dev->flags & F_IPDST_RND)
- seq_printf(seq, "IPDST_RND ");
+ seq_puts(seq, "IPDST_RND ");
if (pkt_dev->flags & F_TXSIZE_RND)
- seq_printf(seq, "TXSIZE_RND ");
+ seq_puts(seq, "TXSIZE_RND ");
if (pkt_dev->flags & F_UDPSRC_RND)
- seq_printf(seq, "UDPSRC_RND ");
+ seq_puts(seq, "UDPSRC_RND ");
if (pkt_dev->flags & F_UDPDST_RND)
- seq_printf(seq, "UDPDST_RND ");
+ seq_puts(seq, "UDPDST_RND ");
if (pkt_dev->flags & F_UDPCSUM)
- seq_printf(seq, "UDPCSUM ");
+ seq_puts(seq, "UDPCSUM ");
if (pkt_dev->flags & F_MPLS_RND)
- seq_printf(seq, "MPLS_RND ");
+ seq_puts(seq, "MPLS_RND ");
if (pkt_dev->flags & F_QUEUE_MAP_RND)
- seq_printf(seq, "QUEUE_MAP_RND ");
+ seq_puts(seq, "QUEUE_MAP_RND ");
if (pkt_dev->flags & F_QUEUE_MAP_CPU)
- seq_printf(seq, "QUEUE_MAP_CPU ");
+ seq_puts(seq, "QUEUE_MAP_CPU ");
if (pkt_dev->cflows) {
if (pkt_dev->flags & F_FLOW_SEQ)
- seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/
+ seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/
else
- seq_printf(seq, "FLOW_RND ");
+ seq_puts(seq, "FLOW_RND ");
}
#ifdef CONFIG_XFRM
if (pkt_dev->flags & F_IPSEC_ON) {
- seq_printf(seq, "IPSEC ");
+ seq_puts(seq, "IPSEC ");
if (pkt_dev->spi)
seq_printf(seq, "spi:%u", pkt_dev->spi);
}
#endif
if (pkt_dev->flags & F_MACSRC_RND)
- seq_printf(seq, "MACSRC_RND ");
+ seq_puts(seq, "MACSRC_RND ");
if (pkt_dev->flags & F_MACDST_RND)
- seq_printf(seq, "MACDST_RND ");
+ seq_puts(seq, "MACDST_RND ");
if (pkt_dev->flags & F_VID_RND)
- seq_printf(seq, "VID_RND ");
+ seq_puts(seq, "VID_RND ");
if (pkt_dev->flags & F_SVID_RND)
- seq_printf(seq, "SVID_RND ");
+ seq_puts(seq, "SVID_RND ");
if (pkt_dev->flags & F_NODE)
- seq_printf(seq, "NODE_ALLOC ");
+ seq_puts(seq, "NODE_ALLOC ");
seq_puts(seq, "\n");
@@ -720,7 +718,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
if (pkt_dev->result[0])
seq_printf(seq, "Result: %s\n", pkt_dev->result);
else
- seq_printf(seq, "Result: Idle\n");
+ seq_puts(seq, "Result: Idle\n");
return 0;
}
@@ -806,7 +804,6 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen)
case '\t':
case ' ':
goto done_str;
- break;
default:
break;
}
@@ -1251,7 +1248,13 @@ static ssize_t pktgen_if_write(struct file *file,
"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
f,
"IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n");
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
+ "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
+ "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+#ifdef CONFIG_XFRM
+ "IPSEC, "
+#endif
+ "NODE_ALLOC\n");
return count;
}
sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
@@ -1733,25 +1736,25 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
BUG_ON(!t);
- seq_printf(seq, "Running: ");
+ seq_puts(seq, "Running: ");
- if_lock(t);
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (pkt_dev->running)
seq_printf(seq, "%s ", pkt_dev->odevname);
- seq_printf(seq, "\nStopped: ");
+ seq_puts(seq, "\nStopped: ");
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (!pkt_dev->running)
seq_printf(seq, "%s ", pkt_dev->odevname);
if (t->result[0])
seq_printf(seq, "\nResult: %s\n", t->result);
else
- seq_printf(seq, "\nResult: NA\n");
+ seq_puts(seq, "\nResult: NA\n");
- if_unlock(t);
+ rcu_read_unlock();
return 0;
}
@@ -1876,10 +1879,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
pkt_dev = pktgen_find_dev(t, ifname, exact);
if (pkt_dev) {
if (remove) {
- if_lock(t);
pkt_dev->removal_mark = 1;
t->control |= T_REMDEV;
- if_unlock(t);
}
break;
}
@@ -1929,7 +1930,8 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
list_for_each_entry(t, &pn->pktgen_threads, th_list) {
struct pktgen_dev *pkt_dev;
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
if (pkt_dev->odev != dev)
continue;
@@ -1944,6 +1946,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
dev->name);
break;
}
+ rcu_read_unlock();
}
}
@@ -2995,8 +2998,8 @@ static void pktgen_run(struct pktgen_thread *t)
func_enter();
- if_lock(t);
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
/*
* setup odev and create initial packet.
@@ -3005,18 +3008,18 @@ static void pktgen_run(struct pktgen_thread *t)
if (pkt_dev->odev) {
pktgen_clear_counters(pkt_dev);
- pkt_dev->running = 1; /* Cranke yeself! */
pkt_dev->skb = NULL;
pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
set_pkt_overhead(pkt_dev);
strcpy(pkt_dev->result, "Starting");
+ pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
strcpy(pkt_dev->result, "Error starting");
}
- if_unlock(t);
+ rcu_read_unlock();
if (started)
t->control &= ~(T_STOP);
}
@@ -3039,27 +3042,25 @@ static int thread_is_running(const struct pktgen_thread *t)
{
const struct pktgen_dev *pkt_dev;
- list_for_each_entry(pkt_dev, &t->if_list, list)
- if (pkt_dev->running)
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running) {
+ rcu_read_unlock();
return 1;
+ }
+ rcu_read_unlock();
return 0;
}
static int pktgen_wait_thread_run(struct pktgen_thread *t)
{
- if_lock(t);
-
while (thread_is_running(t)) {
- if_unlock(t);
-
msleep_interruptible(100);
if (signal_pending(current))
goto signal;
- if_lock(t);
}
- if_unlock(t);
return 1;
signal:
return 0;
@@ -3164,10 +3165,10 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
return -EINVAL;
}
+ pkt_dev->running = 0;
kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
pkt_dev->stopped_at = ktime_get();
- pkt_dev->running = 0;
show_results(pkt_dev, nr_frags);
@@ -3178,9 +3179,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
{
struct pktgen_dev *pkt_dev, *best = NULL;
- if_lock(t);
-
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
if (!pkt_dev->running)
continue;
if (best == NULL)
@@ -3188,7 +3188,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
best = pkt_dev;
}
- if_unlock(t);
+ rcu_read_unlock();
+
return best;
}
@@ -3198,13 +3199,13 @@ static void pktgen_stop(struct pktgen_thread *t)
func_enter();
- if_lock(t);
+ rcu_read_lock();
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
pktgen_stop_device(pkt_dev);
}
- if_unlock(t);
+ rcu_read_unlock();
}
/*
@@ -3218,8 +3219,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
func_enter();
- if_lock(t);
-
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
@@ -3233,8 +3232,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
break;
}
-
- if_unlock(t);
}
static void pktgen_rem_all_ifs(struct pktgen_thread *t)
@@ -3246,8 +3243,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
/* Remove all devices, free mem */
- if_lock(t);
-
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
@@ -3256,8 +3251,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
pktgen_remove_device(t, cur);
}
-
- if_unlock(t);
}
static void pktgen_rem_thread(struct pktgen_thread *t)
@@ -3336,9 +3329,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
queue_map = skb_get_queue_mapping(pkt_dev->skb);
txq = netdev_get_tx_queue(odev, queue_map);
- __netif_tx_lock_bh(txq);
+ local_bh_disable();
- if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
+
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
@@ -3372,7 +3367,9 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->last_ok = 0;
}
unlock:
- __netif_tx_unlock_bh(txq);
+ HARD_TX_UNLOCK(odev, txq);
+
+ local_bh_enable();
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
@@ -3401,10 +3398,10 @@ static int pktgen_thread_worker(void *arg)
pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
- set_current_state(TASK_INTERRUPTIBLE);
-
set_freezable();
+ __set_current_state(TASK_RUNNING);
+
while (!kthread_should_stop()) {
pkt_dev = next_to_run(t);
@@ -3418,8 +3415,6 @@ static int pktgen_thread_worker(void *arg)
continue;
}
- __set_current_state(TASK_RUNNING);
-
if (likely(pkt_dev)) {
pktgen_xmit(pkt_dev);
@@ -3450,9 +3445,8 @@ static int pktgen_thread_worker(void *arg)
}
try_to_freeze();
-
- set_current_state(TASK_INTERRUPTIBLE);
}
+ set_current_state(TASK_INTERRUPTIBLE);
pr_debug("%s stopping all device\n", t->tsk->comm);
pktgen_stop(t);
@@ -3479,8 +3473,8 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
struct pktgen_dev *p, *pkt_dev = NULL;
size_t len = strlen(ifname);
- if_lock(t);
- list_for_each_entry(p, &t->if_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &t->if_list, list)
if (strncmp(p->odevname, ifname, len) == 0) {
if (p->odevname[len]) {
if (exact || p->odevname[len] != '@')
@@ -3490,7 +3484,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
break;
}
- if_unlock(t);
+ rcu_read_unlock();
pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
return pkt_dev;
}
@@ -3504,6 +3498,12 @@ static int add_dev_to_thread(struct pktgen_thread *t,
{
int rv = 0;
+ /* This function cannot be called concurrently, as its called
+ * under pktgen_thread_lock mutex, but it can run from
+ * userspace on another CPU than the kthread. The if_lock()
+ * is used here to sync with concurrent instances of
+ * _rem_dev_from_if_list() invoked via kthread, which is also
+ * updating the if_list */
if_lock(t);
if (pkt_dev->pg_thread) {
@@ -3512,9 +3512,9 @@ static int add_dev_to_thread(struct pktgen_thread *t,
goto out;
}
- list_add(&pkt_dev->list, &t->if_list);
- pkt_dev->pg_thread = t;
pkt_dev->running = 0;
+ pkt_dev->pg_thread = t;
+ list_add_rcu(&pkt_dev->list, &t->if_list);
out:
if_unlock(t);
@@ -3669,11 +3669,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t,
struct list_head *q, *n;
struct pktgen_dev *p;
+ if_lock(t);
list_for_each_safe(q, n, &t->if_list) {
p = list_entry(q, struct pktgen_dev, list);
if (p == pkt_dev)
- list_del(&p->list);
+ list_del_rcu(&p->list);
}
+ if_unlock(t);
}
static int pktgen_remove_device(struct pktgen_thread *t,
@@ -3693,20 +3695,22 @@ static int pktgen_remove_device(struct pktgen_thread *t,
pkt_dev->odev = NULL;
}
- /* And update the thread if_list */
-
- _rem_dev_from_if_list(t, pkt_dev);
-
+ /* Remove proc before if_list entry, because add_device uses
+ * list to determine if interface already exist, avoid race
+ * with proc_create_data() */
if (pkt_dev->entry)
proc_remove(pkt_dev->entry);
+ /* And update the thread if_list */
+ _rem_dev_from_if_list(t, pkt_dev);
+
#ifdef CONFIG_XFRM
free_SAs(pkt_dev);
#endif
vfree(pkt_dev->flows);
if (pkt_dev->page)
put_page(pkt_dev->page);
- kfree(pkt_dev);
+ kfree_rcu(pkt_dev, rcu);
return 0;
}
@@ -3806,6 +3810,7 @@ static void __exit pg_cleanup(void)
{
unregister_netdevice_notifier(&pktgen_notifier_block);
unregister_pernet_subsys(&pg_net_ops);
+ /* Don't need rcu_barrier() due to use of kfree_rcu() */
}
module_init(pg_init);
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 000000000000..4eab4a94a59d
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,193 @@
+/* PTP classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2
+ * ret a ; return PTP class
+ *
+ * ; PTP over UDP over IPv4 over 802.1Q over Ethernet
+ * test_8021q_ipv4:
+ * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
+ * ldb [27] ; load proto
+ * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
+ * ldh [24] ; load frag offset field
+ * jset #0x1fff, drop_8021q_ipv4; don't allow fragments
+ * ldxb 4*([18]&0xf) ; load IP header len
+ * ldh [x + 20] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 26] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over 802.1Q over Ethernet
+ * test_8021q_ipv6:
+ * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
+ * ldb [24] ; load proto
+ * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
+ * ldh [60] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [66] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x30 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct bpf_prog *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return BPF_PROG_RUN(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+void __init ptp_classifier_init(void)
+{
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 32, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 35, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000070 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x0000001b },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000018 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x00000012 },
+ { 0x48, 0, 0, 0x00000014 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x0000001a },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000050 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 8, 0x000086dd },
+ { 0x30, 0, 0, 0x00000018 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x0000003c },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x00000042 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000060 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000030 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog = {
+ .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
+ };
+
+ BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
+}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 4425148d2b51..04db318e6218 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -41,27 +41,27 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
unsigned int nr_table_entries)
{
size_t lopt_size = sizeof(struct listen_sock);
- struct listen_sock *lopt;
+ struct listen_sock *lopt = NULL;
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
- if (lopt_size > PAGE_SIZE)
+
+ if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ lopt = kzalloc(lopt_size, GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
+ if (!lopt)
lopt = vzalloc(lopt_size);
- else
- lopt = kzalloc(lopt_size, GFP_KERNEL);
- if (lopt == NULL)
+ if (!lopt)
return -ENOMEM;
- for (lopt->max_qlen_log = 3;
- (1 << lopt->max_qlen_log) < nr_table_entries;
- lopt->max_qlen_log++);
-
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
+ lopt->max_qlen_log = ilog2(nr_table_entries);
write_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
@@ -72,22 +72,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
void __reqsk_queue_destroy(struct request_sock_queue *queue)
{
- struct listen_sock *lopt;
- size_t lopt_size;
-
- /*
- * this is an error recovery path only
- * no locking needed and the lopt is not NULL
- */
-
- lopt = queue->listen_opt;
- lopt_size = sizeof(struct listen_sock) +
- lopt->nr_table_entries * sizeof(struct request_sock *);
-
- if (lopt_size > PAGE_SIZE)
- vfree(lopt);
- else
- kfree(lopt);
+ /* This is an error recovery path only, no locking needed */
+ kvfree(queue->listen_opt);
}
static inline struct listen_sock *reqsk_queue_yank_listen_sk(
@@ -107,8 +93,6 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
- size_t lopt_size = sizeof(struct listen_sock) +
- lopt->nr_table_entries * sizeof(struct request_sock *);
if (lopt->qlen != 0) {
unsigned int i;
@@ -125,10 +109,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
}
WARN_ON(lopt->qlen != 0);
- if (lopt_size > PAGE_SIZE)
- vfree(lopt);
- else
- kfree(lopt);
+ kvfree(lopt);
}
/*
@@ -221,5 +202,4 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
out:
spin_unlock_bh(&fastopenq->lock);
sock_put(lsk);
- return;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1a0dac2ef9ad..f0493e3b7471 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -299,7 +299,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
if (rtnl_link_ops_get(ops->kind))
return -EEXIST;
- if (!ops->dellink)
+ /* The check for setup is here because if ops
+ * does not have that filled up, it is not possible
+ * to use the ops for creating device. So do not
+ * fill up dellink as well. That disables rtnl_dellink.
+ */
+ if (ops->setup && !ops->dellink)
ops->dellink = unregister_netdevice_queue;
list_add_tail(&ops->list, &link_ops);
@@ -353,15 +358,46 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)
}
EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&netdev_unregistering_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ unregistering = false;
+ rtnl_lock();
+ for_each_net(net) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+ schedule();
+ }
+ finish_wait(&netdev_unregistering_wq, &wait);
+}
+
/**
* rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
* @ops: struct rtnl_link_ops * to unregister
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
- rtnl_lock();
+ /* Close the race with cleanup_net() */
+ mutex_lock(&net_mutex);
+ rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
+ mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
@@ -767,14 +803,16 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += num_vfs *
(nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
- nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
- nla_total_size(sizeof(struct ifla_vf_spoofchk)));
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_link_state)));
return size;
} else
return 0;
}
-static size_t rtnl_port_size(const struct net_device *dev)
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
size_t port_size = nla_total_size(4) /* PORT_VF */
+ nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
@@ -790,7 +828,8 @@ static size_t rtnl_port_size(const struct net_device *dev)
size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ port_size;
- if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
return 0;
if (dev_num_vf(dev->dev.parent))
return port_self_size + vf_ports_size +
@@ -822,10 +861,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
- + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
@@ -887,11 +927,13 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
-static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
{
int err;
- if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
return 0;
err = rtnl_port_self_fill(skb, dev);
@@ -970,7 +1012,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
(dev->qdisc &&
nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
(dev->ifalias &&
- nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)))
+ nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_changes)))
goto nla_put_failure;
if (1) {
@@ -1027,6 +1071,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct ifla_vf_info ivi;
struct ifla_vf_mac vf_mac;
struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_link_state vf_linkstate;
@@ -1047,6 +1092,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
break;
vf_mac.vf =
vf_vlan.vf =
+ vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
vf_linkstate.vf = ivi.vf;
@@ -1054,7 +1100,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
- vf_tx_rate.rate = ivi.tx_rate;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
vf_spoofchk.setting = ivi.spoofchk;
vf_linkstate.link_state = ivi.linkstate;
vf = nla_nest_start(skb, IFLA_VF_INFO);
@@ -1064,6 +1112,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
}
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
&vf_tx_rate) ||
nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
@@ -1076,7 +1126,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_nest_end(skb, vfinfo);
}
- if (rtnl_port_fill(skb, dev))
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
goto nla_put_failure;
if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
@@ -1121,56 +1171,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx = 0, s_idx;
- struct net_device *dev;
- struct hlist_head *head;
- struct nlattr *tb[IFLA_MAX+1];
- u32 ext_filter_mask = 0;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- rcu_read_lock();
- cb->seq = net->dev_base_seq;
-
- if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
- ifla_policy) >= 0) {
-
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
- }
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI,
- ext_filter_mask) <= 0)
- goto out;
-
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
- }
- }
-out:
- rcu_read_unlock();
- cb->args[1] = idx;
- cb->args[0] = h;
-
- return skb->len;
-}
-
-const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
@@ -1196,8 +1197,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
};
-EXPORT_SYMBOL(ifla_policy);
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
[IFLA_INFO_KIND] = { .type = NLA_STRING },
@@ -1219,6 +1220,10 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
.len = sizeof(struct ifla_vf_tx_rate) },
[IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY,
.len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_vf_link_state) },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1235,6 +1240,78 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx = 0, s_idx;
+ struct net_device *dev;
+ struct hlist_head *head;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ int err;
+ int hdrlen;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ rcu_read_lock();
+ cb->seq = net->dev_base_seq;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ NLM_F_MULTI,
+ ext_filter_mask);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err <= 0)
+ goto out;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len)
+{
+ return nla_parse(tb, IFLA_MAX, head, len, ifla_policy);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
struct net *net;
@@ -1316,11 +1393,29 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
}
case IFLA_VF_TX_RATE: {
struct ifla_vf_tx_rate *ivt;
+ struct ifla_vf_info ivf;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf,
+ &ivf);
+ if (err)
+ break;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ break;
+ }
+ case IFLA_VF_RATE: {
+ struct ifla_vf_rate *ivt;
ivt = nla_data(vf);
err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_tx_rate)
- err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
- ivt->rate);
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
break;
}
case IFLA_VF_SPOOFCHK: {
@@ -1386,7 +1481,8 @@ static int do_set_master(struct net_device *dev, int ifindex)
return 0;
}
-static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+static int do_setlink(const struct sk_buff *skb,
+ struct net_device *dev, struct ifinfomsg *ifm,
struct nlattr **tb, char *ifname, int modified)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1398,7 +1494,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
err = PTR_ERR(net);
goto errout;
}
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
err = -EPERM;
goto errout;
}
@@ -1652,7 +1748,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
- err = do_setlink(dev, ifm, tb, ifname, 0);
+ err = do_setlink(skb, dev, ifm, tb, ifname, 0);
errout:
return err;
}
@@ -1687,12 +1783,11 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
return -ENODEV;
ops = dev->rtnl_link_ops;
- if (!ops)
+ if (!ops || !ops->dellink)
return -EOPNOTSUPP;
ops->dellink(dev, &list_kill);
unregister_netdevice_many(&list_kill);
- list_del(&list_kill);
return 0;
}
@@ -1716,7 +1811,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
EXPORT_SYMBOL(rtnl_configure_link);
struct net_device *rtnl_create_link(struct net *net,
- char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
+ char *ifname, unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
int err;
struct net_device *dev;
@@ -1734,8 +1830,8 @@ struct net_device *rtnl_create_link(struct net *net,
num_rx_queues = ops->get_num_rx_queues();
err = -ENOMEM;
- dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
- num_tx_queues, num_rx_queues);
+ dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
+ ops->setup, num_tx_queues, num_rx_queues);
if (!dev)
goto err;
@@ -1769,7 +1865,8 @@ err:
}
EXPORT_SYMBOL(rtnl_create_link);
-static int rtnl_group_changelink(struct net *net, int group,
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, int group,
struct ifinfomsg *ifm,
struct nlattr **tb)
{
@@ -1778,7 +1875,7 @@ static int rtnl_group_changelink(struct net *net, int group,
for_each_netdev(net, dev) {
if (dev->group == group) {
- err = do_setlink(dev, ifm, tb, NULL, 0);
+ err = do_setlink(skb, dev, ifm, tb, NULL, 0);
if (err < 0)
return err;
}
@@ -1799,6 +1896,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+ unsigned char name_assign_type = NET_NAME_USER;
int err;
#ifdef CONFIG_MODULES
@@ -1920,12 +2018,12 @@ replay:
modified = 1;
}
- return do_setlink(dev, ifm, tb, ifname, modified);
+ return do_setlink(skb, dev, ifm, tb, ifname, modified);
}
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
- return rtnl_group_changelink(net,
+ return rtnl_group_changelink(skb, net,
nla_get_u32(tb[IFLA_GROUP]),
ifm, tb);
return -ENODEV;
@@ -1948,14 +2046,19 @@ replay:
return -EOPNOTSUPP;
}
- if (!ifname[0])
+ if (!ops->setup)
+ return -EOPNOTSUPP;
+
+ if (!ifname[0]) {
snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
dest_net = rtnl_link_get_net(net, tb);
if (IS_ERR(dest_net))
return PTR_ERR(dest_net);
- dev = rtnl_create_link(dest_net, ifname, ops, tb);
+ dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out;
@@ -1966,11 +2069,15 @@ replay:
if (ops->newlink) {
err = ops->newlink(net, dev, tb, data);
/* Drivers should call free_netdev() in ->destructor
- * and unregister it on failure so that device could be
- * finally freed in rtnl_unlock.
+ * and unregister it on failure after registration
+ * so that device could be finally freed in rtnl_unlock.
*/
- if (err < 0)
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(dev);
goto out;
+ }
} else {
err = register_netdevice(dev);
if (err < 0) {
@@ -2042,9 +2149,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
u16 min_ifinfo_dump_size = 0;
+ int hdrlen;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
- if (nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
- ifla_policy) >= 0) {
+ if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
}
@@ -2121,12 +2232,13 @@ EXPORT_SYMBOL(rtmsg_ifinfo);
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
struct net_device *dev,
u8 *addr, u32 pid, u32 seq,
- int type, unsigned int flags)
+ int type, unsigned int flags,
+ int nlflags)
{
struct nlmsghdr *nlh;
struct ndmsg *ndm;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
if (!nlh)
return -EMSGSIZE;
@@ -2164,7 +2276,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, int type)
if (!skb)
goto errout;
- err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF);
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, 0, 0, type, NTF_SELF, 0);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -2281,22 +2393,20 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
struct net_device *dev,
const unsigned char *addr)
{
- int err = -EOPNOTSUPP;
+ int err = -EINVAL;
/* If aging addresses are supported device will need to
* implement its own handler for this.
*/
if (!(ndm->ndm_state & NUD_PERMANENT)) {
pr_info("%s: FDB only supports static addresses\n", dev->name);
- return -EINVAL;
+ return err;
}
if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
err = dev_uc_del(dev, addr);
else if (is_multicast_ether_addr(addr))
err = dev_mc_del(dev, addr);
- else
- err = -EINVAL;
return err;
}
@@ -2311,7 +2421,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
int err = -EINVAL;
__u8 *addr;
- if (!capable(CAP_NET_ADMIN))
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2389,7 +2499,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr,
portid, seq,
- RTM_NEWNEIGH, NTF_SELF);
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI);
if (err < 0)
return err;
skip:
@@ -2409,6 +2520,7 @@ skip:
int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
+ struct net_device *filter_dev,
int idx)
{
int err;
@@ -2426,28 +2538,72 @@ EXPORT_SYMBOL(ndo_dflt_fdb_dump);
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx = 0;
- struct net *net = sock_net(skb->sk);
struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *bdev = NULL;
+ struct net_device *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ const struct net_device_ops *cops = NULL;
+ struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ int brport_idx = 0;
+ int br_idx = 0;
+ int idx = 0;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- struct net_device *br_dev;
- const struct net_device_ops *ops;
+ if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
+ ifla_policy) == 0) {
+ if (tb[IFLA_MASTER])
+ br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ }
- br_dev = netdev_master_upper_dev_get(dev);
- ops = br_dev->netdev_ops;
- if (ops->ndo_fdb_dump)
- idx = ops->ndo_fdb_dump(skb, cb, dev, idx);
+ brport_idx = ifm->ifi_index;
+
+ if (br_idx) {
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev)
+ return -ENODEV;
+
+ ops = br_dev->netdev_ops;
+ bdev = br_dev;
+ }
+
+ for_each_netdev(net, dev) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
+
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+
+ bdev = dev;
+ } else {
+ if (dev != br_dev &&
+ !(dev->priv_flags & IFF_BRIDGE_PORT))
+ continue;
+
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !(dev->priv_flags & IFF_EBRIDGE))
+ continue;
+
+ bdev = br_dev;
+ cops = ops;
}
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (cops && cops->ndo_fdb_dump)
+ idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ idx);
+ }
+
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
if (dev->netdev_ops->ndo_fdb_dump)
- idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx);
- else
- idx = ndo_dflt_fdb_dump(skb, cb, dev, idx);
+ idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, bdev, dev,
+ idx);
+
+ cops = NULL;
}
- rcu_read_unlock();
cb->args[0] = idx;
return skb->len;
@@ -2762,7 +2918,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
sz_idx = type>>2;
kind = type&3;
- if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 897da56f3aff..ba71212f0251 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -85,31 +85,6 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-__u32 secure_ip_id(__be32 daddr)
-{
- u32 hash[MD5_DIGEST_WORDS];
-
- net_secret_init();
- hash[0] = (__force __u32) daddr;
- hash[1] = net_secret[13];
- hash[2] = net_secret[14];
- hash[3] = net_secret[15];
-
- md5_transform(hash, net_secret);
-
- return hash[0];
-}
-
-__u32 secure_ipv6_id(const __be32 daddr[4])
-{
- __u32 hash[4];
-
- net_secret_init();
- memcpy(hash, daddr, 16);
- md5_transform(hash, net_secret);
-
- return hash[0];
-}
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 869c7afe3b07..8d289697cc7a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
+#include <linux/if_vlan.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -689,12 +690,15 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->ooo_okay = old->ooo_okay;
new->no_fcs = old->no_fcs;
new->encapsulation = old->encapsulation;
+ new->encap_hdr_csum = old->encap_hdr_csum;
+ new->csum_valid = old->csum_valid;
+ new->csum_complete_sw = old->csum_complete_sw;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
memcpy(new->cb, old->cb, sizeof(old->cb));
new->csum = old->csum;
- new->local_df = old->local_df;
+ new->ignore_df = old->ignore_df;
new->pkt_type = old->pkt_type;
new->ip_summed = old->ip_summed;
skb_copy_queue_mapping(new, old);
@@ -951,10 +955,13 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
EXPORT_SYMBOL(skb_copy);
/**
- * __pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
* @skb: buffer to copy
* @headroom: headroom of new skb
* @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
*
* Make a copy of both an &sk_buff and part of its data, located
* in header. Fragmented data remain shared. This is used when
@@ -964,11 +971,12 @@ EXPORT_SYMBOL(skb_copy);
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
{
unsigned int size = skb_headlen(skb) + headroom;
- struct sk_buff *n = __alloc_skb(size, gfp_mask,
- skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
if (!n)
goto out;
@@ -1008,7 +1016,7 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
out:
return n;
}
-EXPORT_SYMBOL(__pskb_copy);
+EXPORT_SYMBOL(__pskb_copy_fclone);
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -2127,25 +2135,31 @@ EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
*
* The `hlen` as calculated by skb_zerocopy_headlen() specifies the
* headroom in the `to` buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
*/
-void
-skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
int i, j = 0;
int plen = 0; /* length of skb->head fragment */
+ int ret;
struct page *page;
unsigned int offset;
BUG_ON(!from->head_frag && !hlen);
/* dont bother with small payloads */
- if (len <= skb_tailroom(to)) {
- skb_copy_bits(from, 0, skb_put(to, len), len);
- return;
- }
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
if (hlen) {
- skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
len -= hlen;
} else {
plen = min_t(int, skb_headlen(from), len);
@@ -2163,6 +2177,11 @@ skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
to->len += len + plen;
to->data_len += len + plen;
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
if (!len)
break;
@@ -2173,6 +2192,8 @@ skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
j++;
}
skb_shinfo(to)->nr_frags = j;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);
@@ -2626,7 +2647,7 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
* skb_seq_read() will return the remaining part of the block.
*
* Note 1: The size of each block of data returned can be arbitrary,
- * this limitation is the cost for zerocopy seqeuental
+ * this limitation is the cost for zerocopy sequential
* reads of potentially non linear data.
*
* Note 2: Fragment lists within fragments are not implemented
@@ -2760,7 +2781,7 @@ EXPORT_SYMBOL(skb_find_text);
/**
* skb_append_datato_frags - append the user data to a skb
* @sk: sock structure
- * @skb: skb structure to be appened with user data.
+ * @skb: skb structure to be appended with user data.
* @getfrag: call back function to be used for getting the user data
* @from: pointer to user message iov
* @length: length of the iov message
@@ -2866,13 +2887,16 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int err = -ENOMEM;
int i = 0;
int pos;
+ int dummy;
- proto = skb_network_protocol(head_skb);
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, &dummy);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
- csum = !!can_checksum_protocol(features, proto);
- __skb_push(head_skb, doffset);
+ csum = !head_skb->encap_hdr_csum &&
+ !!can_checksum_protocol(features, proto);
+
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -2953,9 +2977,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
tail = nskb;
__copy_skb_header(nskb, head_skb);
- nskb->mac_len = head_skb->mac_len;
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+ skb_reset_mac_len(nskb);
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen,
@@ -2969,6 +2993,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb, len),
len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
continue;
}
@@ -3038,6 +3064,8 @@ perform_csum_check:
nskb->csum = skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);
@@ -3062,7 +3090,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
if (unlikely(p->len + len >= 65536))
return -E2BIG;
- lp = NAPI_GRO_CB(p)->last ?: p;
+ lp = NAPI_GRO_CB(p)->last;
pinfo = skb_shinfo(lp);
if (headlen <= offset) {
@@ -3124,6 +3152,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
+ /* switch back to head shinfo */
+ pinfo = skb_shinfo(p);
+
if (pinfo->frag_list)
goto merge;
if (skb_gro_len(p) != pinfo->gso_size)
@@ -3178,7 +3209,7 @@ merge:
__skb_pull(skb, offset);
- if (!NAPI_GRO_CB(p)->last)
+ if (NAPI_GRO_CB(p)->last == p)
skb_shinfo(p)->frag_list = skb;
else
NAPI_GRO_CB(p)->last->next = skb;
@@ -3286,6 +3317,32 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
return elt;
}
+/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
+ * sglist without mark the sg which contain last skb data as the end.
+ * So the caller can mannipulate sg list as will when padding new data after
+ * the first call without calling sg_unmark_end to expend sg list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * is more preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
+
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
int nsg = __skb_to_sgvec(skb, sg, offset, len);
@@ -3418,8 +3475,6 @@ static void sock_rmem_free(struct sk_buff *skb)
*/
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
- int len = skb->len;
-
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned int)sk->sk_rcvbuf)
return -ENOMEM;
@@ -3434,15 +3489,15 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
-void skb_tstamp_tx(struct sk_buff *orig_skb,
- struct skb_shared_hwtstamps *hwtstamps)
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
{
- struct sock *sk = orig_skb->sk;
struct sock_exterr_skb *serr;
struct sk_buff *skb;
int err;
@@ -3470,12 +3525,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = tstype;
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
+ serr->ee.ee_data = skb_shinfo(skb)->tskey;
+ if (sk->sk_protocol == IPPROTO_TCP)
+ serr->ee.ee_data -= sk->sk_tskey;
+ }
err = sock_queue_err_skb(sk, skb);
if (err)
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ SCM_TSTAMP_SND);
+}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
@@ -3548,15 +3617,47 @@ static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
return 0;
}
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ switch (proto) {
+ int err;
+
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
/* This value should be large enough to cover a tagged ethernet header plus
* maximally sized IP and TCP or UDP headers.
*/
#define MAX_IP_HDR_LEN 128
-static int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate)
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
unsigned int off;
bool fragment;
+ __sum16 *csum;
int err;
fragment = false;
@@ -3577,51 +3678,15 @@ static int skb_checksum_setup_ip(struct sk_buff *skb, bool recalculate)
if (fragment)
goto out;
- switch (ip_hdr(skb)->protocol) {
- case IPPROTO_TCP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct tcphdr),
- MAX_IP_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct tcphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- tcp_hdr(skb)->check =
- ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_TCP, 0);
- break;
- case IPPROTO_UDP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct udphdr),
- MAX_IP_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct udphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- udp_hdr(skb)->check =
- ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
- ip_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_UDP, 0);
- break;
- default:
- goto out;
- }
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
err = 0;
out:
@@ -3644,6 +3709,7 @@ static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
unsigned int len;
bool fragment;
bool done;
+ __sum16 *csum;
fragment = false;
done = false;
@@ -3721,51 +3787,14 @@ static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
if (!done || fragment)
goto out;
- switch (nexthdr) {
- case IPPROTO_TCP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct tcphdr),
- MAX_IPV6_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct tcphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- tcp_hdr(skb)->check =
- ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_TCP, 0);
- break;
- case IPPROTO_UDP:
- err = skb_maybe_pull_tail(skb,
- off + sizeof(struct udphdr),
- MAX_IPV6_HDR_LEN);
- if (err < 0)
- goto out;
-
- if (!skb_partial_csum_set(skb, off,
- offsetof(struct udphdr, check))) {
- err = -EPROTO;
- goto out;
- }
-
- if (recalculate)
- udp_hdr(skb)->check =
- ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr,
- skb->len - off,
- IPPROTO_UDP, 0);
- break;
- default:
- goto out;
- }
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
err = 0;
out:
@@ -3783,7 +3812,7 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
switch (skb->protocol) {
case htons(ETH_P_IP):
- err = skb_checksum_setup_ip(skb, recalculate);
+ err = skb_checksum_setup_ipv4(skb, recalculate);
break;
case htons(ETH_P_IPV6):
@@ -3915,7 +3944,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
- skb->local_df = 0;
+ skb->ignore_df = 0;
skb_dst_drop(skb);
skb->mark = 0;
secpath_reset(skb);
@@ -3937,12 +3966,66 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
- unsigned int hdr_len;
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- hdr_len = tcp_hdrlen(skb);
- else
- hdr_len = sizeof(struct udphdr);
- return hdr_len + shinfo->gso_size;
+ return tcp_hdrlen(skb) + shinfo->gso_size;
+
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return shinfo->gso_size;
}
EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+ if (skb_cow(skb, skb_headroom(skb)) < 0) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+ skb->mac_header += VLAN_HLEN;
+ return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+ struct vlan_hdr *vhdr;
+ u16 vlan_tci;
+
+ if (unlikely(vlan_tx_tag_present(skb))) {
+ /* vlan_tci is already set-up so leave this for another time */
+ return skb;
+ }
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto err_free;
+
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+ goto err_free;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ vlan_tci = ntohs(vhdr->h_vlan_TCI);
+ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+ skb_pull_rcsum(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vhdr);
+
+ skb = skb_reorder_vlan_header(skb);
+ if (unlikely(!skb))
+ goto err_free;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+
+err_free:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
diff --git a/net/core/sock.c b/net/core/sock.c
index 2f143c3b190a..611f424fb76b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,55 @@
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in the user
+ * namespace @user_ns.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in all user
+ * namespaces.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was created
+ * and the current process has the capability @cap over the network namespace
+ * the socket is a member of.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
+
+
#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
@@ -428,7 +477,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, skb_len);
+ sk->sk_data_ready(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
@@ -442,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
}
@@ -735,7 +784,7 @@ set_rcvbuf:
break;
case SO_NO_CHECK:
- sk->sk_no_check = valbool;
+ sk->sk_no_check_tx = valbool;
break;
case SO_PRIORITY:
@@ -799,24 +848,25 @@ set_rcvbuf:
ret = -EINVAL;
break;
}
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
- val & SOF_TIMESTAMPING_TX_HARDWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
- val & SOF_TIMESTAMPING_TX_SOFTWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
- val & SOF_TIMESTAMPING_RX_HARDWARE);
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_tskey = tcp_sk(sk)->snd_una;
+ } else {
+ sk->sk_tskey = 0;
+ }
+ }
+ sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
- val & SOF_TIMESTAMPING_SOFTWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
- val & SOF_TIMESTAMPING_SYS_HARDWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
- val & SOF_TIMESTAMPING_RAW_HARDWARE);
break;
case SO_RCVLOWAT:
@@ -1015,7 +1065,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_NO_CHECK:
- v.val = sk->sk_no_check;
+ v.val = sk->sk_no_check_tx;
break;
case SO_PRIORITY:
@@ -1042,21 +1092,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_TIMESTAMPING:
- v.val = 0;
- if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
- v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
- v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
- v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
- v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
- v.val |= SOF_TIMESTAMPING_SOFTWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
- v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
- v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+ v.val = sk->sk_tsflags;
break;
case SO_RCVTIMEO:
@@ -1429,6 +1465,7 @@ static void sk_update_clone(const struct sock *sk, struct sock *newsk)
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
+ bool is_charged = true;
newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
if (newsk != NULL) {
@@ -1470,9 +1507,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
filter = rcu_dereference_protected(newsk->sk_filter, 1);
if (filter != NULL)
- sk_filter_charge(newsk, filter);
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
+ */
+ is_charged = sk_filter_charge(newsk, filter);
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
@@ -1778,6 +1819,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
order);
if (page)
goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
}
order--;
}
@@ -1819,16 +1863,14 @@ EXPORT_SYMBOL(sock_alloc_send_skb);
* skb_page_frag_refill - check that a page_frag contains enough room
* @sz: minimum size of the fragment we want to get
* @pfrag: pointer to page_frag
- * @prio: priority for memory allocation
+ * @gfp: priority for memory allocation
*
* Note: While this allocator tries to use high order pages, there is
* no guarantee that allocations succeed. Therefore, @sz MUST be
* less or equal than PAGE_SIZE.
*/
-bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
- int order;
-
if (pfrag->page) {
if (atomic_read(&pfrag->page->_count) == 1) {
pfrag->offset = 0;
@@ -1839,20 +1881,21 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
put_page(pfrag->page);
}
- order = SKB_FRAG_PAGE_ORDER;
- do {
- gfp_t gfp = prio;
-
- if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
- pfrag->page = alloc_pages(gfp, order);
+ pfrag->offset = 0;
+ if (SKB_FRAG_PAGE_ORDER) {
+ pfrag->page = alloc_pages(gfp | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
- pfrag->offset = 0;
- pfrag->size = PAGE_SIZE << order;
+ pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
return true;
}
- } while (--order >= 0);
-
+ }
+ pfrag->page = alloc_page(gfp);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE;
+ return true;
+ }
return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
@@ -2193,7 +2236,7 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk, int len)
+static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a0e9cf6379de..ad704c757bb4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -49,38 +49,35 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
}
EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
-int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk,
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
struct sk_buff *skb, int attrtype)
{
- struct nlattr *attr;
+ struct sock_fprog_kern *fprog;
struct sk_filter *filter;
- unsigned int len;
+ struct nlattr *attr;
+ unsigned int flen;
int err = 0;
- if (!ns_capable(user_ns, CAP_NET_ADMIN)) {
+ if (!may_report_filterinfo) {
nla_reserve(skb, attrtype, 0);
return 0;
}
rcu_read_lock();
-
filter = rcu_dereference(sk->sk_filter);
- len = filter ? filter->len * sizeof(struct sock_filter) : 0;
+ if (!filter)
+ goto out;
- attr = nla_reserve(skb, attrtype, len);
+ fprog = filter->prog->orig_prog;
+ flen = bpf_classic_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
if (attr == NULL) {
err = -EMSGSIZE;
goto out;
}
- if (filter) {
- struct sock_filter *fb = (struct sock_filter *)nla_data(attr);
- int i;
-
- for (i = 0; i < filter->len; i++, fb++)
- sk_decode_filter(&filter->insns[i], fb);
- }
-
+ memcpy(nla_data(attr), fprog->filter, flen);
out:
rcu_read_unlock();
return err;
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 661b5a40ec10..a8770391ea5b 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -23,16 +23,11 @@
#include <linux/skbuff.h>
#include <linux/export.h>
-static struct sock_filter ptp_filter[] = {
- PTP_FILTER
-};
-
static unsigned int classify(const struct sk_buff *skb)
{
- if (likely(skb->dev &&
- skb->dev->phydev &&
+ if (likely(skb->dev && skb->dev->phydev &&
skb->dev->phydev->drv))
- return sk_run_filter(skb, ptp_filter);
+ return ptp_classify_raw(skb);
else
return PTP_CLASS_NONE;
}
@@ -48,29 +43,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
return;
type = classify(skb);
+ if (type == PTP_CLASS_NONE)
+ return;
- switch (type) {
- case PTP_CLASS_V1_IPV4:
- case PTP_CLASS_V1_IPV6:
- case PTP_CLASS_V2_IPV4:
- case PTP_CLASS_V2_IPV6:
- case PTP_CLASS_V2_L2:
- case PTP_CLASS_V2_VLAN:
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->txtstamp)) {
- if (!atomic_inc_not_zero(&sk->sk_refcnt))
- return;
- clone = skb_clone(skb, GFP_ATOMIC);
- if (!clone) {
- sock_put(sk);
- return;
- }
- clone->sk = sk;
- phydev->drv->txtstamp(phydev, clone, type);
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->txtstamp)) {
+ if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ return;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return;
}
- break;
- default:
- break;
+
+ clone->sk = sk;
+ phydev->drv->txtstamp(phydev, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
@@ -89,12 +77,15 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
}
*skb_hwtstamps(skb) = *hwtstamps;
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
skb->sk = NULL;
+
err = sock_queue_err_skb(sk, skb);
+
sock_put(sk);
if (err)
kfree_skb(skb);
@@ -114,26 +105,13 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
__skb_pull(skb, ETH_HLEN);
- switch (type) {
- case PTP_CLASS_V1_IPV4:
- case PTP_CLASS_V1_IPV6:
- case PTP_CLASS_V2_IPV4:
- case PTP_CLASS_V2_IPV6:
- case PTP_CLASS_V2_L2:
- case PTP_CLASS_V2_VLAN:
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->rxtstamp))
- return phydev->drv->rxtstamp(phydev, skb, type);
- break;
- default:
- break;
- }
+ if (type == PTP_CLASS_NONE)
+ return false;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->rxtstamp))
+ return phydev->drv->rxtstamp(phydev, skb, type);
return false;
}
EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
-
-void __init skb_timestamping_init(void)
-{
- BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));
-}
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 000000000000..8c3203c585b0
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,77 @@
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/tso.h>
+
+/* Calculate expected number of TX descriptors */
+int tso_count_descs(struct sk_buff *skb)
+{
+ /* The Marvell Way */
+ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+EXPORT_SYMBOL(tso_count_descs);
+
+void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ iph = (struct iphdr *)(hdr + mac_hdr_len);
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
+ tcph->seq = htonl(tso->tcp_seq);
+ tso->ip_id++;
+
+ if (!is_last) {
+ /* Clear all special flags for not last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size;
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+void tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->next_frag_idx = 0;
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_start);
diff --git a/net/core/utils.c b/net/core/utils.c
index 2f737bf90b3f..eed34338736c 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -348,8 +348,8 @@ static void __net_random_once_deferred(struct work_struct *w)
{
struct __net_random_once_work *work =
container_of(w, struct __net_random_once_work, work);
- if (!static_key_enabled(work->key))
- static_key_slow_inc(work->key);
+ BUG_ON(!static_key_enabled(work->key));
+ static_key_slow_dec(work->key);
kfree(work);
}
@@ -367,7 +367,7 @@ static void __net_random_once_disable_jump(struct static_key *key)
}
bool __net_get_random_once(void *buf, int nbytes, bool *done,
- struct static_key *done_key)
+ struct static_key *once_key)
{
static DEFINE_SPINLOCK(lock);
unsigned long flags;
@@ -382,7 +382,7 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done,
*done = true;
spin_unlock_irqrestore(&lock, flags);
- __net_random_once_disable_jump(done_key);
+ __net_random_once_disable_jump(once_key);
return true;
}