diff options
Diffstat (limited to 'net/ipv6/route.c')
-rw-r--r-- | net/ipv6/route.c | 1221 |
1 files changed, 634 insertions, 587 deletions
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0302e0eb07af..23a20d62daac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -59,7 +59,7 @@ #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> -#include <net/nexthop.h> +#include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> @@ -102,14 +102,15 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict); static size_t rt6_nlmsg_size(struct fib6_info *rt); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, +static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, struct in6_addr *daddr, struct in6_addr *saddr); @@ -295,7 +296,7 @@ static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, - .fib6_ref = ATOMIC_INIT(1), + .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; @@ -379,11 +380,8 @@ static void ip6_dst_destroy(struct dst_entry *dst) in6_dev_put(idev); } - rcu_read_lock(); - from = rcu_dereference(rt->from); - rcu_assign_pointer(rt->from, NULL); + from = xchg((__force struct fib6_info **)&rt->from, NULL); fib6_info_release(from); - rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -427,13 +425,15 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -struct fib6_info *fib6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict) +void fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict) { struct fib6_info *sibling, *next_sibling; + struct fib6_info *match = res->f6i; + + if (!match->fib6_nsiblings || have_oif_match) + goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -441,61 +441,89 @@ struct fib6_info *fib6_multipath_select(const struct net *net, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) - return match; + if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { + const struct fib6_nh *nh = &sibling->fib6_nh; int nh_upper_bound; - nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); + nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (fl6->mp_hash > nh_upper_bound) continue; - if (rt6_score_route(sibling, oif, strict) < 0) + if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } - return match; +out: + res->f6i = match; + res->nh = &match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. */ -static inline struct fib6_info *rt6_device_match(struct net *net, - struct fib6_info *rt, - const struct in6_addr *saddr, - int oif, - int flags) +static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, + const struct in6_addr *saddr, int oif, int flags) { - struct fib6_info *sprt; + const struct net_device *dev; - if (!oif && ipv6_addr_any(saddr) && - !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) - return rt; + if (nh->fib_nh_flags & RTNH_F_DEAD) + return false; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { - const struct net_device *dev = sprt->fib6_nh.nh_dev; + dev = nh->fib_nh_dev; + if (oif) { + if (dev->ifindex == oif) + return true; + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return true; + } - if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) - continue; + return false; +} - if (oif) { - if (dev->ifindex == oif) - return sprt; - } else { - if (ipv6_chk_addr(net, saddr, dev, - flags & RT6_LOOKUP_F_IFACE)) - return sprt; +static void rt6_device_match(struct net *net, struct fib6_result *res, + const struct in6_addr *saddr, int oif, int flags) +{ + struct fib6_info *f6i = res->f6i; + struct fib6_info *spf6i; + struct fib6_nh *nh; + + if (!oif && ipv6_addr_any(saddr)) { + nh = &f6i->fib6_nh; + if (!(nh->fib_nh_flags & RTNH_F_DEAD)) + goto out; + } + + for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { + nh = &spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) { + res->f6i = spf6i; + goto out; } } - if (oif && flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.fib6_null_entry; + if (oif && flags & RT6_LOOKUP_F_IFACE) { + res->f6i = net->ipv6.fib6_null_entry; + nh = &res->f6i->fib6_nh; + goto out; + } - return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; + nh = &f6i->fib6_nh; + if (nh->fib_nh_flags & RTNH_F_DEAD) { + res->f6i = net->ipv6.fib6_null_entry; + nh = &res->f6i->fib6_nh; + } +out: + res->nh = nh; + res->fib6_type = res->f6i->fib6_type; + res->fib6_flags = res->f6i->fib6_flags; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -517,7 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct fib6_info *rt) +static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; @@ -533,11 +561,11 @@ static void rt6_probe(struct fib6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ - if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) + if (fib6_nh->fib_nh_gw_family) return; - nh_gw = &rt->fib6_nh.nh_gw; - dev = rt->fib6_nh.nh_dev; + nh_gw = &fib6_nh->fib_nh_gw6; + dev = fib6_nh->fib_nh_dev; rcu_read_lock_bh(); idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); @@ -554,13 +582,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else if (time_after(jiffies, rt->last_probe + + } else if (time_after(jiffies, fib6_nh->last_probe + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { - rt->last_probe = jiffies; + fib6_nh->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); @@ -572,7 +600,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct fib6_info *rt) +static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif @@ -580,27 +608,14 @@ static inline void rt6_probe(struct fib6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct fib6_info *rt, int oif) -{ - const struct net_device *dev = rt->fib6_nh.nh_dev; - - if (!oif || dev->ifindex == oif) - return 2; - return 0; -} - -static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) +static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; - if (rt->fib6_flags & RTF_NONEXTHOP || - !(rt->fib6_flags & RTF_GATEWAY)) - return RT6_NUD_SUCCEED; - rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, - &rt->fib6_nh.nh_gw); + neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, + &fib6_nh->fib_nh_gw6); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -621,58 +636,44 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) return ret; } -static int rt6_score_route(struct fib6_info *rt, int oif, int strict) +static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, + int strict) { - int m; + int m = 0; + + if (!oif || nh->fib_nh_dev->ifindex == oif) + m = 2; - m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif - if (strict & RT6_LOOKUP_F_REACHABLE) { - int n = rt6_check_neigh(rt); + if ((strict & RT6_LOOKUP_F_REACHABLE) && + !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { + int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } -/* called with rc_read_lock held */ -static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) +static bool find_match(struct fib6_nh *nh, u32 fib6_flags, + int oif, int strict, int *mpri, bool *do_rr) { - const struct net_device *dev = fib6_info_nh_dev(f6i); + bool match_do_rr = false; bool rc = false; - - if (dev) { - const struct inet6_dev *idev = __in6_dev_get(dev); - - rc = !!idev->cnf.ignore_routes_with_linkdown; - } - - return rc; -} - -static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, - int *mpri, struct fib6_info *match, - bool *do_rr) -{ int m; - bool match_do_rr = false; - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) + if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; - if (fib6_ignore_linkdown(rt) && - rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && + if (ip6_ignore_linkdown(nh->fib_nh_dev) && + nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; - if (fib6_check_expired(rt)) - goto out; - - m = rt6_score_route(rt, oif, strict); + m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ @@ -681,67 +682,82 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict, } if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(rt); + rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; - match = rt; + rc = true; } out: - return match; + return rc; } -static struct fib6_info *find_rr_leaf(struct fib6_node *fn, - struct fib6_info *leaf, - struct fib6_info *rr_head, - u32 metric, int oif, int strict, - bool *do_rr) +static void __find_rr_leaf(struct fib6_info *f6i_start, + struct fib6_info *nomatch, u32 metric, + struct fib6_result *res, struct fib6_info **cont, + int oif, int strict, bool *do_rr, int *mpri) { - struct fib6_info *rt, *match, *cont; - int mpri = -1; + struct fib6_info *f6i; - match = NULL; - cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + for (f6i = f6i_start; + f6i && f6i != nomatch; + f6i = rcu_dereference(f6i->fib6_next)) { + struct fib6_nh *nh; + + if (cont && f6i->fib6_metric != metric) { + *cont = f6i; + return; } - match = find_match(rt, oif, strict, &mpri, match, do_rr); - } + if (fib6_check_expired(f6i)) + continue; - for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->fib6_next)) { - if (rt->fib6_metric != metric) { - cont = rt; - break; + nh = &f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + res->f6i = f6i; + res->nh = nh; + res->fib6_flags = f6i->fib6_flags; + res->fib6_type = f6i->fib6_type; } - - match = find_match(rt, oif, strict, &mpri, match, do_rr); } +} + +static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, + struct fib6_info *rr_head, int oif, int strict, + bool *do_rr, struct fib6_result *res) +{ + u32 metric = rr_head->fib6_metric; + struct fib6_info *cont = NULL; + int mpri = -1; - if (match || !cont) - return match; + __find_rr_leaf(rr_head, NULL, metric, res, &cont, + oif, strict, do_rr, &mpri); - for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) - match = find_match(rt, oif, strict, &mpri, match, do_rr); + __find_rr_leaf(leaf, rr_head, metric, res, &cont, + oif, strict, do_rr, &mpri); - return match; + if (res->f6i || !cont) + return; + + __find_rr_leaf(cont, NULL, metric, res, NULL, + oif, strict, do_rr, &mpri); } -static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, - int oif, int strict) +static void rt6_select(struct net *net, struct fib6_node *fn, int oif, + struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); - struct fib6_info *match, *rt0; + struct fib6_info *rt0; bool do_rr = false; int key_plen; + /* make sure this function or its helpers sets f6i */ + res->f6i = NULL; + if (!leaf || leaf == net->ipv6.fib6_null_entry) - return net->ipv6.fib6_null_entry; + goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -758,11 +774,9 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.fib6_null_entry; - - match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, - &do_rr); + goto out; + find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); @@ -779,12 +793,19 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, } } - return match ? match : net->ipv6.fib6_null_entry; +out: + if (!res->f6i) { + res->f6i = net->ipv6.fib6_null_entry; + res->nh = &res->f6i->fib6_nh; + res->fib6_flags = res->f6i->fib6_flags; + res->fib6_type = res->f6i->fib6_type; + } } -static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { - return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); + return (res->f6i->fib6_flags & RTF_NONEXTHOP) || + res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -868,17 +889,17 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, */ /* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) +static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { - struct net_device *dev = rt->fib6_nh.nh_dev; + struct net_device *dev = res->nh->fib_nh_dev; - if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { + if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->fib6_dst.addr)) + !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; @@ -924,11 +945,11 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt) return flags; } -static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) +static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { - rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); + rt->dst.error = ip6_rt_type_to_error(fib6_type); - switch (ort->fib6_type) { + switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; @@ -946,26 +967,28 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) } } -static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) +static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { - if (ort->fib6_flags & RTF_REJECT) { - ip6_rt_init_dst_reject(rt, ort); + struct fib6_info *f6i = res->f6i; + + if (res->fib6_flags & RTF_REJECT) { + ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; - if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) { + if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; - } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { + } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; } else { rt->dst.input = ip6_forward; } - if (ort->fib6_nh.nh_lwtstate) { - rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); + if (res->nh->fib_nh_lws) { + rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } @@ -980,20 +1003,25 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } -/* Caller must already hold reference to @ort */ -static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) +/* Caller must already hold reference to f6i in result */ +static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) { - struct net_device *dev = fib6_info_nh_dev(ort); + const struct fib6_nh *nh = res->nh; + const struct net_device *dev = nh->fib_nh_dev; + struct fib6_info *f6i = res->f6i; - ip6_rt_init_dst(rt, ort); + ip6_rt_init_dst(rt, res); - rt->rt6i_dst = ort->fib6_dst; + rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; - rt->rt6i_gateway = ort->fib6_nh.nh_gw; - rt->rt6i_flags = ort->fib6_flags; - rt6_set_from(rt, ort); + rt->rt6i_flags = res->fib6_flags; + if (nh->fib_nh_gw_family) { + rt->rt6i_gateway = nh->fib_nh_gw6; + rt->rt6i_flags |= RTF_GATEWAY; + } + rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->fib6_src; + rt->rt6i_src = f6i->fib6_src; #endif } @@ -1015,14 +1043,13 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } -static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, - bool null_fallback) +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) { struct rt6_info *rt = *prt; if (dst_hold_safe(&rt->dst)) return true; - if (null_fallback) { + if (net) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } else { @@ -1033,22 +1060,24 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, } /* called with rcu_lock held */ -static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) +static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { - unsigned short flags = fib6_info_dst_flags(rt); - struct net_device *dev = rt->fib6_nh.nh_dev; + struct net_device *dev = res->nh->fib_nh_dev; + struct fib6_info *f6i = res->f6i; + unsigned short flags; struct rt6_info *nrt; - if (!fib6_info_hold_safe(rt)) + if (!fib6_info_hold_safe(f6i)) goto fallback; + flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { - fib6_info_release(rt); + fib6_info_release(f6i); goto fallback; } - ip6_rt_copy_init(nrt, rt); + ip6_rt_copy_init(nrt, res); return nrt; fallback: @@ -1063,7 +1092,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, const struct sk_buff *skb, int flags) { - struct fib6_info *f6i; + struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; @@ -1073,37 +1102,38 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - f6i = rcu_dereference(fn->leaf); - if (!f6i) { - f6i = net->ipv6.fib6_null_entry; - } else { - f6i = rt6_device_match(net, f6i, &fl6->saddr, - fl6->flowi6_oif, flags); - if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) - f6i = fib6_multipath_select(net, f6i, fl6, - fl6->flowi6_oif, skb, - flags); - } - if (f6i == net->ipv6.fib6_null_entry) { + res.f6i = rcu_dereference(fn->leaf); + if (!res.f6i) + res.f6i = net->ipv6.fib6_null_entry; + else + rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, + flags); + + if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; + + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + goto out; } - trace_fib6_table_lookup(net, f6i, table, fl6); + fib6_select_path(net, &res, fl6, fl6->flowi6_oif, + fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ - rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt, true)) + if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); - } else if (f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); } else { - rt = ip6_create_rt_rcu(f6i); + rt = ip6_create_rt_rcu(&res); } +out: + trace_fib6_table_lookup(net, &res, table, fl6); + rcu_read_unlock(); return rt; @@ -1169,10 +1199,11 @@ int ip6_ins_rt(struct net *net, struct fib6_info *rt) return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; @@ -1180,25 +1211,25 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, * Clone the route. */ - if (!fib6_info_hold_safe(ort)) + if (!fib6_info_hold_safe(f6i)) return NULL; - dev = ip6_rt_get_dev_rcu(ort); + dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { - fib6_info_release(ort); + fib6_info_release(f6i); return NULL; } - ip6_rt_copy_init(rt, ort); + ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->dst.flags |= DST_HOST; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; - if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->fib6_dst.plen != 128 && - ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) + if (!rt6_is_gw_or_nonexthop(res)) { + if (f6i->fib6_dst.plen != 128 && + ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -1211,55 +1242,56 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { - unsigned short flags = fib6_info_dst_flags(rt); + struct fib6_info *f6i = res->f6i; + unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; - if (!fib6_info_hold_safe(rt)) + if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); - dev = ip6_rt_get_dev_rcu(rt); + dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) { - fib6_info_release(rt); + fib6_info_release(f6i); return NULL; } - ip6_rt_copy_init(pcpu_rt, rt); + ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt, **p; - p = this_cpu_ptr(rt->rt6i_pcpu); + p = this_cpu_ptr(res->f6i->rt6i_pcpu); pcpu_rt = *p; if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt, false); + ip6_hold_safe(NULL, &pcpu_rt); return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct net *net, - struct fib6_info *rt) + const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; - pcpu_rt = ip6_rt_pcpu_alloc(rt); + pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) { dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(rt->rt6i_pcpu); + p = this_cpu_ptr(res->f6i->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1288,9 +1320,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = rcu_dereference_protected(rt6_ex->rt6i->from, - lockdep_is_held(&rt6_exception_lock)); - rcu_assign_pointer(rt6_ex->rt6i->from, NULL); + from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL); fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); @@ -1402,14 +1432,15 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } -static unsigned int fib6_mtu(const struct fib6_info *rt) +static unsigned int fib6_mtu(const struct fib6_result *res) { + const struct fib6_nh *nh = res->nh; unsigned int mtu; - if (rt->fib6_pmtu) { - mtu = rt->fib6_pmtu; + if (res->f6i->fib6_pmtu) { + mtu = res->f6i->fib6_pmtu; } else { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); @@ -1420,26 +1451,27 @@ static unsigned int fib6_mtu(const struct fib6_info *rt) mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); - return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } static int rt6_insert_exception(struct rt6_info *nrt, - struct fib6_info *ort) + const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; + struct fib6_info *f6i = res->f6i; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (ort->exception_bucket_flushed) { + if (f6i->exception_bucket_flushed) { err = -EINVAL; goto out; } - bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), @@ -1448,24 +1480,24 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); } #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates ort is in subtree + /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by - * a hash of only rt6i_dst. + * a hash of only fib6_dst. */ - if (ort->fib6_src.plen) + if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif - /* rt6_mtu_change() might lower mtu on ort. + /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu - * is less than ort's mtu value. + * is less than f6i's mtu value. */ - if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } @@ -1494,9 +1526,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { - spin_lock_bh(&ort->fib6_table->tb6_lock); - fib6_update_sernum(net, ort); - spin_unlock_bh(&ort->fib6_table->tb6_lock); + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1533,33 +1565,33 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, +static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, struct in6_addr *daddr, struct in6_addr *saddr) { struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct rt6_info *res = NULL; + struct rt6_info *ret = NULL; - bucket = rcu_dereference(rt->rt6i_exception_bucket); + bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); #ifdef CONFIG_IPV6_SUBTREES - /* rt6i_src.plen != 0 indicates rt is in subtree + /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of - * both rt6i_dst and rt6i_src. + * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by - * a hash of only rt6i_dst. + * a hash of only fib6_dst. */ - if (rt->fib6_src.plen) + if (res->f6i->fib6_src.plen) src_key = saddr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) - res = rt6_ex->rt6i; + ret = rt6_ex->rt6i; - return res; + return ret; } /* Remove the passed in cached rt from the hash table that contains it */ @@ -1807,11 +1839,10 @@ void rt6_age_exceptions(struct fib6_info *rt, } /* must be called with rcu lock held */ -struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int strict) +int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, + struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; - struct fib6_info *f6i; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1820,8 +1851,8 @@ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - f6i = rt6_select(net, fn, oif, strict); - if (f6i == net->ipv6.fib6_null_entry) { + rt6_select(net, fn, oif, res, strict); + if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1833,16 +1864,16 @@ redo_rt6_select: } } - trace_fib6_table_lookup(net, f6i, table, fl6); + trace_fib6_table_lookup(net, res, table, fl6); - return f6i; + return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { - struct fib6_info *f6i; + struct fib6_result res = {}; struct rt6_info *rt; int strict = 0; @@ -1853,27 +1884,26 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); - f6i = fib6_table_lookup(net, table, oif, fl6, strict); - if (f6i->fib6_nsiblings) - f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); - - if (f6i == net->ipv6.fib6_null_entry) { + fib6_table_lookup(net, table, oif, fl6, &res, strict); + if (res.f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); return rt; } + fib6_select_path(net, &res, fl6, oif, false, skb, strict); + /*Search through exception table */ - rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt, true)) + if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); rcu_read_unlock(); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(f6i->fib6_flags & RTF_GATEWAY))) { + !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different @@ -1881,7 +1911,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, */ struct rt6_info *uncached_rt; - uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); + uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); rcu_read_unlock(); @@ -1903,10 +1933,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, struct rt6_info *pcpu_rt; local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(f6i); + pcpu_rt = rt6_get_pcpu_route(&res); if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, f6i); + pcpu_rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); rcu_read_unlock(); @@ -2325,15 +2355,23 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { - struct fib6_info *from; + struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); - from = rcu_dereference(rt6->from); - nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); + res.f6i = rcu_dereference(rt6->from); + if (!res.f6i) { + rcu_read_unlock(); + return; + } + res.nh = &res.f6i->fib6_nh; + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, from)) + if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } rcu_read_unlock(); @@ -2406,6 +2444,36 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, NULL); } +static bool ip6_redirect_nh_match(const struct fib6_result *res, + struct flowi6 *fl6, + const struct in6_addr *gw, + struct rt6_info **ret) +{ + const struct fib6_nh *nh = res->nh; + + if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || + fl6->flowi6_oif != nh->fib_nh_dev->ifindex) + return false; + + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. + */ + if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { + *ret = rt_cache; + return true; + } + return false; + } + return true; +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2419,7 +2487,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *ret = NULL, *rt_cache; + struct rt6_info *ret = NULL; + struct fib6_result res = {}; struct fib6_info *rt; struct fib6_node *fn; @@ -2437,34 +2506,15 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) - continue; + res.f6i = rt; + res.nh = &rt->fib6_nh; + if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (!(rt->fib6_flags & RTF_GATEWAY)) - continue; - if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) - continue; - /* rt_cache's gateway might be different from its 'parent' - * in the case of an ip redirect. - * So we keep searching in the exception table if the gateway - * is different. - */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { - rt_cache = rt6_find_cached_rt(rt, - &fl6->daddr, - &fl6->saddr); - if (rt_cache && - ipv6_addr_equal(&rdfl->gateway, - &rt_cache->rt6i_gateway)) { - ret = rt_cache; - break; - } - continue; - } - break; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) + goto out; } if (!rt) @@ -2480,15 +2530,20 @@ restart: goto restart; } + res.f6i = rt; + res.nh = &rt->fib6_nh; out: - if (ret) - ip6_hold_safe(net, &ret, true); - else - ret = ip6_create_rt_rcu(rt); + if (ret) { + ip6_hold_safe(net, &ret); + } else { + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + ret = ip6_create_rt_rcu(&res); + } rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); + trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; @@ -2606,12 +2661,15 @@ out: * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ -u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr) +u32 ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_exception_bucket *bucket; + const struct fib6_nh *nh = res->nh; + struct fib6_info *f6i = res->f6i; + const struct in6_addr *src_key; struct rt6_exception *rt6_ex; - struct in6_addr *src_key; struct inet6_dev *idev; u32 mtu = 0; @@ -2633,7 +2691,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU); if (likely(!mtu)) { - struct net_device *dev = fib6_info_nh_dev(f6i); + struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); @@ -2643,7 +2701,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: - return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu); + return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, @@ -2899,17 +2957,143 @@ out: return err; } +static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) +{ + if ((flags & RTF_REJECT) || + (dev && (dev->flags & IFF_LOOPBACK) && + !(addr_type & IPV6_ADDR_LOOPBACK) && + !(flags & RTF_LOCAL))) + return true; + + return false; +} + +int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + int addr_type; + int err; + + fib6_nh->fib_nh_family = AF_INET6; + + err = -ENODEV; + if (cfg->fc_ifindex) { + dev = dev_get_by_index(net, cfg->fc_ifindex); + if (!dev) + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; + } + + if (cfg->fc_flags & RTNH_F_ONLINK) { + if (!dev) { + NL_SET_ERR_MSG(extack, + "Nexthop device required for onlink"); + goto out; + } + + if (!(dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + + fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; + } + + fib6_nh->fib_nh_weight = 1; + + /* We cannot add true routes via loopback here, + * they would result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { + /* hold loopback dev/idev if we haven't done so. */ + if (dev != net->loopback_dev) { + if (dev) { + dev_put(dev); + in6_dev_put(idev); + } + dev = net->loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } + goto set_dev; + } + + if (cfg->fc_flags & RTF_GATEWAY) { + err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + if (err) + goto out; + + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + } + + err = -ENODEV; + if (!dev) + goto out; + + if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); + err = -EACCES; + goto out; + } + + if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } + + if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && + !netif_carrier_ok(dev)) + fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + + err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, + cfg->fc_encap_type, cfg, gfp_flags, extack); + if (err) + goto out; +set_dev: + fib6_nh->fib_nh_dev = dev; + fib6_nh->fib_nh_oif = dev->ifindex; + err = 0; +out: + if (idev) + in6_dev_put(idev); + + if (err) { + lwtstate_put(fib6_nh->fib_nh_lws); + fib6_nh->fib_nh_lws = NULL; + if (dev) + dev_put(dev); + } + + return err; +} + +void fib6_nh_release(struct fib6_nh *fib6_nh) +{ + fib_nh_common_release(&fib6_nh->nh_common); +} + static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; - struct net_device *dev = NULL; - struct inet6_dev *idev = NULL; struct fib6_table *table; - int addr_type; int err = -EINVAL; + int addr_type; /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { @@ -2943,33 +3127,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif - if (cfg->fc_ifindex) { - err = -ENODEV; - dev = dev_get_by_index(net, cfg->fc_ifindex); - if (!dev) - goto out; - idev = in6_dev_get(dev); - if (!idev) - goto out; - } - - if (cfg->fc_metric == 0) - cfg->fc_metric = IP6_RT_PRIO_USER; - - if (cfg->fc_flags & RTNH_F_ONLINK) { - if (!dev) { - NL_SET_ERR_MSG(extack, - "Nexthop device required for onlink"); - err = -ENODEV; - goto out; - } - - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } - } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3013,18 +3170,10 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; - addr_type = ipv6_addr_type(&cfg->fc_dst); - - if (cfg->fc_encap) { - struct lwtunnel_state *lwtstate; - - err = lwtunnel_build_state(cfg->fc_encap_type, - cfg->fc_encap, AF_INET6, cfg, - &lwtstate, extack); - if (err) - goto out; - rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); - } + rt->fib6_table = table; + rt->fib6_metric = cfg->fc_metric; + rt->fib6_type = cfg->fc_type; + rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; @@ -3035,62 +3184,20 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - - rt->fib6_metric = cfg->fc_metric; - rt->fib6_nh.nh_weight = 1; - - rt->fib6_type = cfg->fc_type; + err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; /* We cannot add true routes via loopback here, - they would result in kernel looping; promote them to reject routes + * they would result in kernel looping; promote them to reject routes */ - if ((cfg->fc_flags & RTF_REJECT) || - (dev && (dev->flags & IFF_LOOPBACK) && - !(addr_type & IPV6_ADDR_LOOPBACK) && - !(cfg->fc_flags & RTF_LOCAL))) { - /* hold loopback dev/idev if we haven't done so. */ - if (dev != net->loopback_dev) { - if (dev) { - dev_put(dev); - in6_dev_put(idev); - } - dev = net->loopback_dev; - dev_hold(dev); - idev = in6_dev_get(dev); - if (!idev) { - err = -ENODEV; - goto out; - } - } - rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; - goto install_route; - } - - if (cfg->fc_flags & RTF_GATEWAY) { - err = ip6_validate_gw(net, cfg, &dev, &idev, extack); - if (err) - goto out; - - rt->fib6_nh.nh_gw = cfg->fc_gateway; - } - - err = -ENODEV; - if (!dev) - goto out; - - if (idev->cnf.disable_ipv6) { - NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); - err = -EACCES; - goto out; - } - - if (!(dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; if (!ipv6_addr_any(&cfg->fc_prefsrc)) { + struct net_device *dev = fib6_info_nh_dev(rt); + if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; @@ -3101,26 +3208,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else rt->fib6_prefsrc.plen = 0; - rt->fib6_flags = cfg->fc_flags; - -install_route: - if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && - !netif_carrier_ok(dev)) - rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; - rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->fib6_nh.nh_dev = dev; - rt->fib6_table = table; - - if (idev) - in6_dev_put(idev); - return rt; out: - if (dev) - dev_put(dev); - if (idev) - in6_dev_put(idev); - fib6_info_release(rt); return ERR_PTR(err); } @@ -3261,10 +3350,16 @@ static int ip6_route_del(struct fib6_config *cfg, if (fn) { for_each_fib6_node_rt_rcu(fn) { + struct fib6_nh *nh; + if (cfg->fc_flags & RTF_CACHE) { + struct fib6_result res = { + .f6i = rt, + }; int rc; - rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + rt_cache = rt6_find_cached_rt(&res, + &cfg->fc_dst, &cfg->fc_src); if (rt_cache) { rc = ip6_del_cached_rt(rt_cache, cfg); @@ -3275,12 +3370,14 @@ static int ip6_route_del(struct fib6_config *cfg, } continue; } + + nh = &rt->fib6_nh; if (cfg->fc_ifindex && - (!rt->fib6_nh.nh_dev || - rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) + (!nh->fib_nh_dev || + nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && - !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) + !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; @@ -3306,10 +3403,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; + struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; - struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3392,14 +3489,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NDISC_REDIRECT, &ndopts); rcu_read_lock(); - from = rcu_dereference(rt->from); - /* This fib6_info_hold() is safe here because we hold reference to rt - * and rt already holds reference to fib6_info. - */ - fib6_info_hold(from); - rcu_read_unlock(); + res.f6i = rcu_dereference(rt->from); + if (!res.f6i) + goto out; - nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); + res.nh = &res.f6i->fib6_nh; + res.fib6_flags = res.f6i->fib6_flags; + res.fib6_type = res.f6i->fib6_type; + nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; @@ -3409,11 +3506,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - /* No need to remove rt from the exception table if rt is - * a cached route because rt6_insert_exception() will - * takes care of it - */ - if (rt6_insert_exception(nrt, from)) { + /* rt6_insert_exception() will take care of duplicated exceptions */ + if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } @@ -3425,7 +3519,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: - fib6_info_release(from); + rcu_read_unlock(); neigh_release(neigh); } @@ -3451,11 +3545,12 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.nh_dev->ifindex != ifindex) + if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) continue; - if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + if (!(rt->fib6_flags & RTF_ROUTEINFO) || + !rt->fib6_nh.fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3513,9 +3608,11 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (dev == rt->fib6_nh.nh_dev && + struct fib6_nh *nh = &rt->fib6_nh; + + if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && - ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) + ipv6_addr_equal(&nh->fib_nh_gw6, addr)) break; } if (rt && !fib6_info_hold_safe(rt)) @@ -3606,7 +3703,7 @@ static void rtmsg_to_fib6_config(struct net *net, .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, - .fc_metric = rtmsg->rtmsg_metric, + .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, @@ -3664,23 +3761,34 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { - int type; struct dst_entry *dst = skb_dst(skb); + struct net *net = dev_net(dst->dev); + struct inet6_dev *idev; + int type; + + if (netif_is_l3_master(skb->dev) && + dst->dev == net->loopback_dev) + idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); + else + idev = ip6_dst_idev(dst); + switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), - __in6_dev_get_safely(skb->dev), - IPSTATS_MIB_INADDRERRORS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } /* FALLTHROUGH */ case IPSTATS_MIB_OUTNOROUTES: - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), - ipstats_mib_noroutes); + IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } + + /* Start over by dropping the dst for l3mdev case */ + if (netif_is_l3_master(skb->dev)) + skb_dst_drop(skb); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb(skb); return 0; @@ -3717,36 +3825,26 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags) { - u32 tb_id; - struct net_device *dev = idev->dev; - struct fib6_info *f6i; - - f6i = fib6_info_alloc(gfp_flags); - if (!f6i) - return ERR_PTR(-ENOMEM); + struct fib6_config cfg = { + .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, + .fc_ifindex = idev->dev->ifindex, + .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP, + .fc_dst = *addr, + .fc_dst_len = 128, + .fc_protocol = RTPROT_KERNEL, + .fc_nlinfo.nl_net = net, + .fc_ignore_dev_down = true, + }; - f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL); - f6i->dst_nocount = true; - f6i->dst_host = true; - f6i->fib6_protocol = RTPROT_KERNEL; - f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) { - f6i->fib6_type = RTN_ANYCAST; - f6i->fib6_flags |= RTF_ANYCAST; + cfg.fc_type = RTN_ANYCAST; + cfg.fc_flags |= RTF_ANYCAST; } else { - f6i->fib6_type = RTN_LOCAL; - f6i->fib6_flags |= RTF_LOCAL; + cfg.fc_type = RTN_LOCAL; + cfg.fc_flags |= RTF_LOCAL; } - f6i->fib6_nh.nh_gw = *addr; - dev_hold(dev); - f6i->fib6_nh.nh_dev = dev; - f6i->fib6_dst.addr = *addr; - f6i->fib6_dst.plen = 128; - tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; - f6i->fib6_table = fib6_get_table(net, tb_id); - - return f6i; + return ip6_route_info_create(&cfg, gfp_flags, NULL); } /* remove deleted ip from prefsrc entries */ @@ -3762,7 +3860,7 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && + if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3784,7 +3882,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) fib6_clean_all(net, fib6_remove_prefsrc, &adni); } -#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) +#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) @@ -3792,7 +3890,8 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) struct in6_addr *gateway = (struct in6_addr *)arg; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { + rt->fib6_nh.fib_nh_gw_family && + ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { return -1; } @@ -3813,7 +3912,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) struct arg_netdev_event { const struct net_device *dev; union { - unsigned int nh_flags; + unsigned char nh_flags; unsigned long event; }; }; @@ -3840,9 +3939,9 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && - fib6_ignore_linkdown(rt))) + if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) return true; return false; @@ -3854,11 +3953,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.nh_weight; + total += rt->fib6_nh.fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.nh_weight; + total += iter->fib6_nh.fib_nh_weight; } return total; @@ -3869,11 +3968,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.nh_weight; + *weight += rt->fib6_nh.fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -3916,8 +4015,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { - rt->fib6_nh.nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && + rt->fib6_nh.fib_nh_dev == arg->dev) { + rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3925,7 +4025,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) return 0; } -void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, @@ -3945,10 +4045,10 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.nh_dev == dev) + if (rt->fib6_nh.fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.nh_dev == dev) + if (iter->fib6_nh.fib_nh_dev == dev) return true; return false; @@ -3969,12 +4069,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.nh_dev == down_dev || - rt->fib6_nh.nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.fib_nh_dev == down_dev || + rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.nh_dev == down_dev || - iter->fib6_nh.nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh.fib_nh_dev == down_dev || + iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -3982,15 +4082,15 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, - unsigned int nh_flags) + unsigned char nh_flags) { struct fib6_info *iter; - if (rt->fib6_nh.nh_dev == dev) - rt->fib6_nh.nh_flags |= nh_flags; + if (rt->fib6_nh.fib_nh_dev == dev) + rt->fib6_nh.fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.nh_dev == dev) - iter->fib6_nh.nh_flags |= nh_flags; + if (iter->fib6_nh.fib_nh_dev == dev) + iter->fib6_nh.fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4005,12 +4105,12 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.nh_dev == dev ? -1 : 0; + return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.nh_dev == dev ? -1 : 0; + return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4026,10 +4126,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.nh_dev != dev || + if (rt->fib6_nh.fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4085,7 +4185,7 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. jumbo frame) */ - if (rt->fib6_nh.nh_dev == arg->dev && + if (rt->fib6_nh.fib_nh_dev == arg->dev && !fib6_metric_locked(rt, RTAX_MTU)) { u32 mtu = rt->fib6_pmtu; @@ -4139,8 +4239,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int pref; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, - extack); + err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); if (err < 0) goto errout; @@ -4376,7 +4476,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4526,6 +4626,9 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_metric == 0) + cfg.fc_metric = IP6_RT_PRIO_USER; + if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else @@ -4540,7 +4643,7 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); + + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); nexthop_len *= rt->fib6_nsiblings; } @@ -4558,77 +4661,10 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } -static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, - unsigned int *flags, bool skip_oif) -{ - if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) - *flags |= RTNH_F_DEAD; - - if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { - *flags |= RTNH_F_LINKDOWN; - - rcu_read_lock(); - if (fib6_ignore_linkdown(rt)) - *flags |= RTNH_F_DEAD; - rcu_read_unlock(); - } - - if (rt->fib6_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) - goto nla_put_failure; - } - - *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); - if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) - *flags |= RTNH_F_OFFLOAD; - - /* not needed for multipath encoding b/c it has a rtnexthop struct */ - if (!skip_oif && rt->fib6_nh.nh_dev && - nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) - goto nla_put_failure; - - if (rt->fib6_nh.nh_lwtstate && - lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -/* add multipath next hop */ -static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) -{ - const struct net_device *dev = rt->fib6_nh.nh_dev; - struct rtnexthop *rtnh; - unsigned int flags = 0; - - rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); - if (!rtnh) - goto nla_put_failure; - - rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; - rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; - - if (rt6_nexthop_info(skb, rt, &flags, true) < 0) - goto nla_put_failure; - - rtnh->rtnh_flags = flags; - - /* length of rtnetlink header + attributes */ - rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4741,23 +4777,30 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *sibling, *next_sibling; struct nlattr *mp; - mp = nla_nest_start(skb, RTA_MULTIPATH); + mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; - if (rt6_add_nexthop(skb, rt) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, + rt->fib6_nh.fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (rt6_add_nexthop(skb, sibling) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, + sibling->fib6_nh.fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); } else { - if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) + unsigned char nh_flags = 0; + + if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + &nh_flags, false) < 0) goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { @@ -4783,7 +4826,7 @@ nla_put_failure: static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.nh_dev == dev) + if (f6i->fib6_nh.fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4791,7 +4834,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.nh_dev == dev) + if (sibling->fib6_nh.fib_nh_dev == dev) return true; } } @@ -4843,8 +4886,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, } if (!netlink_strict_get_check(skb)) - return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv6_policy, extack); + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || @@ -4860,8 +4903,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } - err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, - rtm_ipv6_policy, extack); + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv6_policy, extack); if (err) return err; @@ -5013,16 +5056,20 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); from = rcu_dereference(rt->from); - - if (fibmatch) - err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); - else - err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, - &fl6.saddr, iif, RTM_NEWROUTE, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - 0); + if (from) { + if (fibmatch) + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, + iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + else + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); + } else { + err = -ENETUNREACH; + } rcu_read_unlock(); if (err < 0) { @@ -5076,7 +5123,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5411,7 +5458,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES |