diff options
author | David S. Miller <davem@davemloft.net> | 2018-01-10 15:14:45 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-01-10 15:14:45 -0500 |
commit | 8448f91fcdf8d028e40ec526b39ded01e4dafc11 (patch) | |
tree | 1af7f7291b37d5e8c3106540e06a041d9f563d83 | |
parent | e2b3b35eb9896f26c98b9a2c047d9111638059a2 (diff) | |
parent | 398958ae48f44bb036d0fa9829cd489270bf1fc2 (diff) |
Merge branch 'ipv6-Add-support-for-non-equal-cost-multipath'
Ido Schimmel says:
====================
ipv6: Add support for non-equal-cost multipath
This set aims to add support for IPv6 non-equal-cost multipath routes.
The first three patches convert multipath selection to use the
hash-threshold method (RFC 2992) instead of modulo-N. The same method is
employed by the IPv4 routing code since commit 0e884c78ee19 ("ipv4: L3
hash-based multipath").
Unlike modulo-N, with hash-threshold only the flows near the region
boundaries are affected when a nexthop is added or removed. In addition,
it allows us to easily add support for non-equal-cost multipath in the
last patch by sizing the different regions according to the provided
weights.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/ip6_fib.h | 2 | ||||
-rw-r--r-- | include/net/ip6_route.h | 7 | ||||
-rw-r--r-- | net/ipv6/ip6_fib.c | 8 | ||||
-rw-r--r-- | net/ipv6/route.c | 141 |
4 files changed, 126 insertions, 32 deletions
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ddf53dd1e948..34ec321d6a03 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -149,6 +149,7 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; + atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; @@ -170,6 +171,7 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ + int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 exception_bucket_flushed:1, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 34cd3b0c6ded..27d23a65f3cd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, @@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); +void rt6_multipath_rebalance(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b5f19703fca6..e31118f417b4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -796,12 +796,6 @@ insert_above: return ln; } -static bool rt6_qualify_for_ecmp(struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; @@ -991,6 +985,7 @@ next_iter: rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); + rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1054b059747f..1076ae0ea9d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, int strict) { struct rt6_info *sibling, *next_sibling; - int route_choosen; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -463,28 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, NULL); - route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); - /* Don't change the route, if route_choosen == 0 - * (siblings does not include ourself) - */ - if (route_choosen) - list_for_each_entry_safe(sibling, next_sibling, - &match->rt6i_siblings, rt6i_siblings) { - route_choosen--; - if (route_choosen == 0) { - struct inet6_dev *idev = sibling->rt6i_idev; - - if (sibling->rt6i_nh_flags & RTNH_F_DEAD) - break; - if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN && - idev->cnf.ignore_routes_with_linkdown) - break; - if (rt6_score_route(sibling, oif, strict) < 0) - break; - match = sibling; - break; - } - } + if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + return match; + + list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, + rt6i_siblings) { + if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + continue; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } + return match; } @@ -1833,10 +1823,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) if (skb) { ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys); + return flow_hash_from_keys(&hash_keys) >> 1; } - return get_hash_from_flowi6(fl6); + return get_hash_from_flowi6(fl6) >> 1; } void ip6_route_input(struct sk_buff *skb) @@ -2604,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; + rt->rt6i_nh_weight = 1; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -3481,6 +3472,99 @@ struct arg_netdev_event { }; }; +static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +{ + struct rt6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + while (iter) { + if (iter->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference_protected(iter->rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } + + return NULL; +} + +static bool rt6_is_dead(const struct rt6_info *rt) +{ + if (rt->rt6i_nh_flags & RTNH_F_DEAD || + (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + return true; + + return false; +} + +static int rt6_multipath_total_weight(const struct rt6_info *rt) +{ + struct rt6_info *iter; + int total = 0; + + if (!rt6_is_dead(rt)) + total += rt->rt6i_nh_weight; + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + if (!rt6_is_dead(iter)) + total += iter->rt6i_nh_weight; + } + + return total; +} + +static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +{ + int upper_bound = -1; + + if (!rt6_is_dead(rt)) { + *weight += rt->rt6i_nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, + total) - 1; + } + atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); +} + +static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +{ + struct rt6_info *iter; + int weight = 0; + + rt6_upper_bound_set(rt, &weight, total); + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + rt6_upper_bound_set(iter, &weight, total); +} + +void rt6_multipath_rebalance(struct rt6_info *rt) +{ + struct rt6_info *first; + int total; + + /* In case the entire multipath route was marked for flushing, + * then there is no need to rebalance upon the removal of every + * sibling route. + */ + if (!rt->rt6i_nsiblings || rt->should_flush) + return; + + /* During lookup routes are evaluated in order, so we need to + * make sure upper bounds are assigned from the first sibling + * onwards. + */ + first = rt6_multipath_first_sibling(rt); + if (WARN_ON_ONCE(!first)) + return; + + total = rt6_multipath_total_weight(first); + rt6_multipath_upper_bound_set(first, total); +} + static int fib6_ifup(struct rt6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; @@ -3489,6 +3573,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { rt->rt6i_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + rt6_multipath_rebalance(rt); } return 0; @@ -3588,6 +3673,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(rt); + rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: @@ -3595,6 +3681,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt6_multipath_rebalance(rt); break; } @@ -3938,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); @@ -4160,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = 0; + rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) |