2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
26 * Fixed routing subtrees.
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
54 #include <linux/rtnetlink.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
60 #include <asm/uaccess.h>
63 #include <linux/sysctl.h>
/* Debug tracing macros. NOTE(review): the #if/#else pair selecting between
 * the verbose RT6_TRACE and the no-op variant is missing from this extract. */
66 /* Set to 3 to get tracing. */
70 #define RDBG(x) printk x
71 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
74 #define RT6_TRACE(x...) do { ; } while (0)
77 #define CLONE_OFFLINK_ROUTE 0
/* Default values for the IPv6 route-cache garbage collector and PMTU
 * handling; exported to sysctl elsewhere in this file. */
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void ip6_dst_destroy(struct dst_entry *);
91 static void ip6_dst_ifdown(struct dst_entry *,
92 struct net_device *dev, int how);
93 static int ip6_dst_gc(void);
95 static int ip6_pkt_discard(struct sk_buff *skb);
96 static int ip6_pkt_discard_out(struct sk_buff *skb);
97 static void ip6_link_failure(struct sk_buff *skb);
98 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102 struct in6_addr *gwaddr, int ifindex,
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105 struct in6_addr *gwaddr, int ifindex);
/* dst_ops vtable wiring IPv6 routes into the generic dst cache.
 * NOTE(review): some initializer lines (e.g. .family, .gc) are missing
 * from this extract. */
108 static struct dst_ops ip6_dst_ops = {
110 .protocol = __constant_htons(ETH_P_IPV6),
113 .check = ip6_dst_check,
114 .destroy = ip6_dst_destroy,
115 .ifdown = ip6_dst_ifdown,
116 .negative_advice = ip6_negative_advice,
117 .link_failure = ip6_link_failure,
118 .update_pmtu = ip6_rt_update_pmtu,
119 .entry_size = sizeof(struct rt6_info),
/* Sentinel route returned on lookup failure: rejects packets with
 * -ENETUNREACH. Never freed (refcount pinned at 1, metric is maximal). */
122 struct rt6_info ip6_null_entry = {
125 .__refcnt = ATOMIC_INIT(1),
127 .dev = &loopback_dev,
129 .error = -ENETUNREACH,
130 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
131 .input = ip6_pkt_discard,
132 .output = ip6_pkt_discard_out,
134 .path = (struct dst_entry*)&ip6_null_entry,
137 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
138 .rt6i_metric = ~(u32) 0,
139 .rt6i_ref = ATOMIC_INIT(1),
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Sentinel used by policy-routing "prohibit" rules.
 * NOTE(review): the .error initializer (-EACCES upstream) is missing
 * from this extract. */
144 struct rt6_info ip6_prohibit_entry = {
147 .__refcnt = ATOMIC_INIT(1),
149 .dev = &loopback_dev,
152 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
153 .input = ip6_pkt_discard,
154 .output = ip6_pkt_discard_out,
156 .path = (struct dst_entry*)&ip6_prohibit_entry,
159 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
160 .rt6i_metric = ~(u32) 0,
161 .rt6i_ref = ATOMIC_INIT(1),
/* Sentinel used by policy-routing "blackhole" rules: silently discards. */
164 struct rt6_info ip6_blk_hole_entry = {
167 .__refcnt = ATOMIC_INIT(1),
169 .dev = &loopback_dev,
172 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
173 .input = ip6_pkt_discard,
174 .output = ip6_pkt_discard_out,
176 .path = (struct dst_entry*)&ip6_blk_hole_entry,
179 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
180 .rt6i_metric = ~(u32) 0,
181 .rt6i_ref = ATOMIC_INIT(1),
186 /* allocate dst with ip6_dst_ops */
/* Allocate a fresh rt6_info from the IPv6 dst-entry pool. */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
/* dst_ops.destroy: release the inet6_dev reference held by this route.
 * NOTE(review): the in6_dev_put(idev) call is missing from this extract. */
192 static void ip6_dst_destroy(struct dst_entry *dst)
194 struct rt6_info *rt = (struct rt6_info *)dst;
195 struct inet6_dev *idev = rt->rt6i_idev;
198 rt->rt6i_idev = NULL;
/* dst_ops.ifdown: when 'dev' disappears, repoint this cached route's idev
 * at the loopback device so rt6i_idev stays valid.
 * NOTE(review): the matching in6_dev_put of the old idev is missing from
 * this extract. */
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
206 struct rt6_info *rt = (struct rt6_info *)dst;
207 struct inet6_dev *idev = rt->rt6i_idev;
209 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211 if (loopback_idev != NULL) {
212 rt->rt6i_idev = loopback_idev;
/* True when the route carries RTF_EXPIRES and its expiry jiffy has passed. */
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 return (rt->rt6i_flags & RTF_EXPIRES &&
221 time_after(jiffies, rt->rt6i_expires))
/* Multicast and link-local destinations require a strict, interface-bound
 * lookup (RT6_LOOKUP_F_IFACE). */
224 static inline int rt6_need_strict(struct in6_addr *daddr)
226 return (ipv6_addr_type(daddr) &
227 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL))
231 * Route lookup. Any table->tb6_lock is implied.
/* Walk the leaf chain and select the route bound to interface 'oif';
 * loopback routes match through their idev's ifindex. Falls back to
 * ip6_null_entry when a strict lookup matches nothing.
 * NOTE(review): several branch bodies/returns are missing from this
 * extract. */
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
238 struct rt6_info *local = NULL;
239 struct rt6_info *sprt;
242 for (sprt = rt; sprt; sprt = sprt->u.next) {
243 struct net_device *dev = sprt->rt6i_dev;
244 if (dev->ifindex == oif)
246 if (dev->flags & IFF_LOOPBACK) {
247 if (sprt->rt6i_idev == NULL ||
248 sprt->rt6i_idev->dev->ifindex != oif) {
251 if (local && (!oif ||
252 local->rt6i_idev->dev->ifindex == oif))
263 return &ip6_null_entry;
268 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing: if the route's next-hop neighbour is not
 * NUD_VALID and the per-device probe interval has elapsed, send a unicast
 * Neighbour Solicitation to re-verify it. Rate-limited via neigh->updated. */
269 static void rt6_probe(struct rt6_info *rt)
271 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
273 * Okay, this does not seem to be appropriate
274 * for now, however, we need to check if it
275 * is really so; aka Router Reachability Probing.
277 * Router Reachability Probe MUST be rate-limited
278 * to no more than one per minute.
280 if (!neigh || (neigh->nud_state & NUD_VALID))
282 read_lock_bh(&neigh->lock);
283 if (!(neigh->nud_state & NUD_VALID) &&
284 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
285 struct in6_addr mcaddr;
286 struct in6_addr *target;
288 neigh->updated = jiffies;
289 read_unlock_bh(&neigh->lock);
291 target = (struct in6_addr *)&neigh->primary_key;
292 addrconf_addr_solict_mult(target, &mcaddr);
293 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
295 read_unlock_bh(&neigh->lock);
/* No-op stub when router preferences are compiled out. */
298 static inline void rt6_probe(struct rt6_info *rt)
305 * Default Router Selection (RFC 2461 6.3.6)
/* Device-match score for Default Router Selection: non-zero when the route's
 * device matches 'oif' (or no oif is requested); loopback routes match via
 * their idev's ifindex. */
307 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
309 struct net_device *dev = rt->rt6i_dev;
310 if (!oif || dev->ifindex == oif)
312 if ((dev->flags & IFF_LOOPBACK) &&
313 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Reachability score for the route's next hop, taken from the neighbour
 * cache NUD state under the neighbour's read lock. Non-gateway and
 * NONEXTHOP routes are treated as trivially usable. */
318 static int inline rt6_check_neigh(struct rt6_info *rt)
320 struct neighbour *neigh = rt->rt6i_nexthop;
322 if (rt->rt6i_flags & RTF_NONEXTHOP ||
323 !(rt->rt6i_flags & RTF_GATEWAY))
326 read_lock_bh(&neigh->lock);
327 if (neigh->nud_state & NUD_VALID)
329 read_unlock_bh(&neigh->lock);
/* Combine device match, router preference bits and neighbour reachability
 * into one comparable score; a negative result marks the route unusable
 * under the given 'strict' flags. */
334 static int rt6_score_route(struct rt6_info *rt, int oif,
339 m = rt6_check_dev(rt, oif);
340 if (!m && (strict & RT6_LOOKUP_F_IFACE))
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 n = rt6_check_neigh(rt);
348 else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
/* Default Router Selection (RFC 2461 6.3.6): scan same-metric siblings for
 * the best-scoring route; when none is (probably) reachable, rotate the
 * sibling list under a private spinlock so routers are tried round-robin. */
353 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
356 struct rt6_info *match = NULL, *last = NULL;
357 struct rt6_info *rt, *rt0 = *head;
361 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
362 __FUNCTION__, head, head ? *head : NULL, oif);
364 for (rt = rt0, metric = rt0->rt6i_metric;
365 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
369 if (rt6_check_expired(rt))
374 m = rt6_score_route(rt, oif, strict);
388 (strict & RT6_LOOKUP_F_REACHABLE) &&
389 last && last != rt0) {
390 /* no entries matched; do round-robin */
391 static DEFINE_SPINLOCK(lock);
394 rt0->u.next = last->u.next;
399 RT6_TRACE("%s() => %p, score=%d\n",
400 __FUNCTION__, match, mpri);
402 return (match ? match : &ip6_null_entry);
405 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router Advertisement:
 * validate option length vs prefix length, clamp the lifetime against
 * jiffies overflow, then add, refresh or (lifetime == 0) delete the
 * corresponding RTF_ROUTEINFO route for this router.
 * NOTE(review): htonl() on rinfo->lifetime — upstream uses ntohl(); the
 * two are the same transformation, so behavior is unchanged. */
406 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
407 struct in6_addr *gwaddr)
409 struct route_info *rinfo = (struct route_info *) opt;
410 struct in6_addr prefix_buf, *prefix;
415 if (len < sizeof(struct route_info)) {
419 /* Sanity check for prefix_len and length */
420 if (rinfo->length > 3) {
422 } else if (rinfo->prefix_len > 128) {
424 } else if (rinfo->prefix_len > 64) {
425 if (rinfo->length < 2) {
428 } else if (rinfo->prefix_len > 0) {
429 if (rinfo->length < 1) {
434 pref = rinfo->route_pref;
435 if (pref == ICMPV6_ROUTER_PREF_INVALID)
436 pref = ICMPV6_ROUTER_PREF_MEDIUM;
438 lifetime = htonl(rinfo->lifetime);
439 if (lifetime == 0xffffffff) {
441 } else if (lifetime > 0x7fffffff/HZ) {
442 /* Avoid arithmetic overflow */
443 lifetime = 0x7fffffff/HZ - 1;
446 if (rinfo->length == 3)
447 prefix = (struct in6_addr *)rinfo->prefix;
449 /* this function is safe */
450 ipv6_addr_prefix(&prefix_buf,
451 (struct in6_addr *)rinfo->prefix,
453 prefix = &prefix_buf;
456 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
458 if (rt && !lifetime) {
464 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
467 rt->rt6i_flags = RTF_ROUTEINFO |
468 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
471 if (lifetime == 0xffffffff) {
472 rt->rt6i_flags &= ~RTF_EXPIRES;
474 rt->rt6i_expires = jiffies + HZ * lifetime;
475 rt->rt6i_flags |= RTF_EXPIRES;
477 dst_release(&rt->u.dst);
/* Lookup backtracking: when the match is ip6_null_entry, climb to the
 * parent fib6_node (re-descending into source-address subtrees where
 * present) until a node with route info (RTN_RTINFO) or the tree root
 * is reached. Expects 'rt' and 'fn' in the caller's scope. */
483 #define BACKTRACK(saddr) \
485 if (rt == &ip6_null_entry) { \
486 struct fib6_node *pn; \
488 if (fn->fn_flags & RTN_TL_ROOT) \
491 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
492 fn = fib6_lookup(pn->subtree, NULL, saddr); \
495 if (fn->fn_flags & RTN_RTINFO) \
/* Plain (non-cloning) per-table lookup used by rt6_lookup(): find the
 * fib6 node, pick the device-matching route, take a reference. */
501 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
502 struct flowi *fl, int flags)
504 struct fib6_node *fn;
507 read_lock_bh(&table->tb6_lock);
508 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
511 rt = rt6_device_match(rt, fl->oif, flags);
512 BACKTRACK(&fl->fl6_src);
513 dst_hold(&rt->u.dst);
515 read_unlock_bh(&table->tb6_lock);
517 rt->u.dst.lastuse = jiffies;
/* Public lookup helper: build a flowi and dispatch through the policy
 * routing rules. Returns a referenced route; caller releases it. */
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
536 struct dst_entry *dst;
537 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
539 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
541 return (struct rt6_info *) dst;
548 /* ip6_ins_rt is called with FREE table->tb6_lock.
549 It takes new route entry, the addition fails by any reason the
550 route is freed. In any case, if caller does not hold it, it may
/* Insert 'rt' into its table under the table write lock; on failure
 * fib6_add() has already freed the entry (see comment above). */
554 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
557 struct fib6_table *table;
559 table = rt->rt6i_table;
560 write_lock_bh(&table->tb6_lock);
561 err = fib6_add(&table->tb6_root, rt, info);
562 write_unlock_bh(&table->tb6_lock);
/* Insert a route with no netlink notification context. */
567 int ip6_ins_rt(struct rt6_info *rt)
569 return __ip6_ins_rt(rt, NULL);
/* Copy-on-write clone of a connected route: produce a /128 RTF_CACHE host
 * route for 'daddr', marking direct routes to the prefix address as
 * anycast, and bind a fresh neighbour entry for the next hop. */
572 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
573 struct in6_addr *saddr)
581 rt = ip6_rt_copy(ort);
584 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
585 if (rt->rt6i_dst.plen != 128 &&
586 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
587 rt->rt6i_flags |= RTF_ANYCAST;
588 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
591 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
592 rt->rt6i_dst.plen = 128;
593 rt->rt6i_flags |= RTF_CACHE;
594 rt->u.dst.flags |= DST_HOST;
596 #ifdef CONFIG_IPV6_SUBTREES
597 if (rt->rt6i_src.plen && saddr) {
598 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
599 rt->rt6i_src.plen = 128;
603 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Clone a gatewayed (or NONEXTHOP) route into a /128 RTF_CACHE entry,
 * sharing the original's neighbour entry via neigh_clone(). */
610 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
612 struct rt6_info *rt = ip6_rt_copy(ort);
614 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
615 rt->rt6i_dst.plen = 128;
616 rt->rt6i_flags |= RTF_CACHE;
617 if (rt->rt6i_flags & RTF_REJECT)
618 rt->u.dst.error = ort->u.dst.error;
619 rt->u.dst.flags |= DST_HOST;
620 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/* Input-path policy lookup: select a route (preferring reachable routers),
 * and unless it is already a cached host route, COW/clone it into an
 * RTF_CACHE entry and insert it — relooking up on the insert race where
 * another CPU added the clone while tb6_lock was dropped. */
625 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
626 struct flowi *fl, int flags)
628 struct fib6_node *fn;
629 struct rt6_info *rt, *nrt;
633 int reachable = RT6_LOOKUP_F_REACHABLE;
635 strict |= flags & RT6_LOOKUP_F_IFACE;
638 read_lock_bh(&table->tb6_lock);
641 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
644 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
645 BACKTRACK(&fl->fl6_src);
646 if (rt == &ip6_null_entry ||
647 rt->rt6i_flags & RTF_CACHE)
650 dst_hold(&rt->u.dst);
651 read_unlock_bh(&table->tb6_lock);
653 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
654 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
656 #if CLONE_OFFLINK_ROUTE
657 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
663 dst_release(&rt->u.dst);
664 rt = nrt ? : &ip6_null_entry;
666 dst_hold(&rt->u.dst);
668 err = ip6_ins_rt(nrt);
677 * Race condition! In the gap, when table->tb6_lock was
678 * released someone could insert this route. Relookup.
680 dst_release(&rt->u.dst);
688 dst_hold(&rt->u.dst);
689 read_unlock_bh(&table->tb6_lock);
691 rt->u.dst.lastuse = jiffies;
/* Route an incoming packet: build a flowi from the IPv6 header and attach
 * the resolved dst to the skb. */
697 void ip6_route_input(struct sk_buff *skb)
699 struct ipv6hdr *iph = skb->nh.ipv6h;
701 .iif = skb->dev->ifindex,
706 .fwmark = skb->nfmark,
707 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
710 .proto = iph->nexthdr,
712 int flags = rt6_need_strict(&iph->daddr) ? RT6_LOOKUP_F_IFACE : 0;
714 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
/* Output-path policy lookup: mirrors ip6_pol_route_input() but matches on
 * fl->oif instead of the incoming interface; COW/clone and insert with the
 * same insert-race relookup. */
717 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
718 struct flowi *fl, int flags)
720 struct fib6_node *fn;
721 struct rt6_info *rt, *nrt;
725 int reachable = RT6_LOOKUP_F_REACHABLE;
727 strict |= flags & RT6_LOOKUP_F_IFACE;
730 read_lock_bh(&table->tb6_lock);
733 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
736 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
737 BACKTRACK(&fl->fl6_src);
738 if (rt == &ip6_null_entry ||
739 rt->rt6i_flags & RTF_CACHE)
742 dst_hold(&rt->u.dst);
743 read_unlock_bh(&table->tb6_lock);
745 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
746 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
748 #if CLONE_OFFLINK_ROUTE
749 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
755 dst_release(&rt->u.dst);
756 rt = nrt ? : &ip6_null_entry;
758 dst_hold(&rt->u.dst);
760 err = ip6_ins_rt(nrt);
769 * Race condition! In the gap, when table->tb6_lock was
770 * released someone could insert this route. Relookup.
772 dst_release(&rt->u.dst);
780 dst_hold(&rt->u.dst);
781 read_unlock_bh(&table->tb6_lock);
783 rt->u.dst.lastuse = jiffies;
/* Route a locally generated packet; strict interface matching is forced
 * for multicast/link-local destinations. */
788 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
792 if (rt6_need_strict(&fl->fl6_dst))
793 flags |= RT6_LOOKUP_F_IFACE;
795 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
800 * Destination cache support functions
/* dst_ops.check: a cached dst remains valid while its fib6_node's serial
 * number still equals the cookie recorded at lookup time. */
803 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
807 rt = (struct rt6_info *) dst;
809 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops.negative_advice: on negative feedback, drop cached (RTF_CACHE)
 * clones so a fresh route is computed next time.
 * NOTE(review): the deletion/release calls are missing from this extract. */
815 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
817 struct rt6_info *rt = (struct rt6_info *) dst;
820 if (rt->rt6i_flags & RTF_CACHE)
/* dst_ops.link_failure: report unreachability to the sender, then expire
 * the cached clone or invalidate the default route's fib node serial. */
828 static void ip6_link_failure(struct sk_buff *skb)
832 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
834 rt = (struct rt6_info *) skb->dst;
836 if (rt->rt6i_flags&RTF_CACHE) {
837 dst_set_expires(&rt->u.dst, 0);
838 rt->rt6i_flags |= RTF_EXPIRES;
839 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
840 rt->rt6i_node->fn_sernum = -1;
/* dst_ops.update_pmtu: lower the cached MTU on a host route; below the
 * IPv6 minimum MTU (1280) the all-fragment feature flag is set instead
 * of going under the floor. Notifies netevent listeners. */
844 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
846 struct rt6_info *rt6 = (struct rt6_info*)dst;
848 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
849 rt6->rt6i_flags |= RTF_MODIFIED;
850 if (mtu < IPV6_MIN_MTU) {
852 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
854 dst->metrics[RTAX_MTU-1] = mtu;
855 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
859 static int ipv6_get_mtu(struct net_device *dev);
/* Derive the advertised MSS metric from a link MTU, clamped between
 * ip6_rt_min_advmss and the maximal non-jumbo payload. */
861 static inline unsigned int ipv6_advmss(unsigned int mtu)
863 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
865 if (mtu < ip6_rt_min_advmss)
866 mtu = ip6_rt_min_advmss;
869 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
870 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
871 * IPV6_MAXPLEN is also valid and means: "any MSS,
872 * rely only on pmtu discovery"
874 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Singly linked list of ndisc-allocated dst entries awaiting GC, guarded
 * by ndisc_lock. */
879 static struct dst_entry *ndisc_dst_gc_list;
880 static DEFINE_SPINLOCK(ndisc_lock);
/* Allocate a standalone host dst for neighbour-discovery traffic: wired
 * to the given output function, chained onto ndisc_dst_gc_list for later
 * garbage collection, and kicks the fib6 GC timer. */
882 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
883 struct neighbour *neigh,
884 struct in6_addr *addr,
885 int (*output)(struct sk_buff *))
888 struct inet6_dev *idev = in6_dev_get(dev);
890 if (unlikely(idev == NULL))
893 rt = ip6_dst_alloc();
894 if (unlikely(rt == NULL)) {
903 neigh = ndisc_get_neigh(dev, addr);
906 rt->rt6i_idev = idev;
907 rt->rt6i_nexthop = neigh;
908 atomic_set(&rt->u.dst.__refcnt, 1);
909 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
910 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
911 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
912 rt->u.dst.output = output;
914 #if 0 /* there's no chance to use these for ndisc */
915 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
918 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
919 rt->rt6i_dst.plen = 128;
922 spin_lock_bh(&ndisc_lock);
923 rt->u.dst.next = ndisc_dst_gc_list;
924 ndisc_dst_gc_list = &rt->u.dst;
925 spin_unlock_bh(&ndisc_lock);
927 fib6_force_start_gc();
930 return (struct dst_entry *)rt;
/* Reap unreferenced entries from ndisc_dst_gc_list; '*more' reports
 * whether referenced entries remain.
 * NOTE(review): the unlink/free and counter-update lines are missing
 * from this extract. */
933 int ndisc_dst_gc(int *more)
935 struct dst_entry *dst, *next, **pprev;
941 spin_lock_bh(&ndisc_lock);
942 pprev = &ndisc_dst_gc_list;
944 while ((dst = *pprev) != NULL) {
945 if (!atomic_read(&dst->__refcnt)) {
955 spin_unlock_bh(&ndisc_lock);
/* dst_ops.gc: adaptive garbage collection — the expiry window shrinks
 * geometrically (by 1/2^elasticity) under pressure and resets once the
 * cache drops below gc_thresh. Returns non-zero while still over limit. */
960 static int ip6_dst_gc(void)
962 static unsigned expire = 30*HZ;
963 static unsigned long last_gc;
964 unsigned long now = jiffies;
966 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
967 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
973 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
974 expire = ip6_rt_gc_timeout>>1;
977 expire -= expire>>ip6_rt_gc_elasticity;
978 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
981 /* Clean host part of a prefix. Not necessary in radix tree,
982 but results in cleaner routing tables.
984 Remove it only when all the things will work!
/* Return the device's IPv6 MTU (cnf.mtu6), or IPV6_MIN_MTU when the
 * device has no inet6_dev. */
987 static int ipv6_get_mtu(struct net_device *dev)
989 int mtu = IPV6_MIN_MTU;
990 struct inet6_dev *idev;
992 idev = in6_dev_get(dev);
994 mtu = idev->cnf.mtu6;
/* Return the per-device hop limit, falling back to the global default. */
1002 int ipv6_get_hoplimit(struct net_device *dev)
1002 int hoplimit = ipv6_devconf.hop_limit;
1003 struct inet6_dev *idev;
1005 idev = in6_dev_get(dev);
1007 hoplimit = idev->cnf.hop_limit;
/* Add a route described by 'cfg' to its table: validate prefix lengths,
 * resolve the device (directly by ifindex, or via a recursive lookup of
 * the gateway for RTF_GATEWAY routes), promote loopback routes to reject
 * routes, resolve the next-hop neighbour, apply metrics and insert.
 * NOTE(review): many error-path lines (gotos, in6_dev_put/dev_put cleanup)
 * are missing from this extract. */
1017 int ip6_route_add(struct fib6_config *cfg)
1020 struct rt6_info *rt = NULL;
1021 struct net_device *dev = NULL;
1022 struct inet6_dev *idev = NULL;
1023 struct fib6_table *table;
1026 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1028 #ifndef CONFIG_IPV6_SUBTREES
1029 if (cfg->fc_src_len)
1032 if (cfg->fc_ifindex) {
1034 dev = dev_get_by_index(cfg->fc_ifindex);
1037 idev = in6_dev_get(dev);
1042 if (cfg->fc_metric == 0)
1043 cfg->fc_metric = IP6_RT_PRIO_USER;
1045 table = fib6_new_table(cfg->fc_table);
1046 if (table == NULL) {
1051 rt = ip6_dst_alloc();
1058 rt->u.dst.obsolete = -1;
1059 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1061 if (cfg->fc_protocol == RTPROT_UNSPEC)
1062 cfg->fc_protocol = RTPROT_BOOT;
1063 rt->rt6i_protocol = cfg->fc_protocol;
1065 addr_type = ipv6_addr_type(&cfg->fc_dst);
1067 if (addr_type & IPV6_ADDR_MULTICAST)
1068 rt->u.dst.input = ip6_mc_input;
1070 rt->u.dst.input = ip6_forward;
1072 rt->u.dst.output = ip6_output;
1074 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1075 rt->rt6i_dst.plen = cfg->fc_dst_len;
1076 if (rt->rt6i_dst.plen == 128)
1077 rt->u.dst.flags = DST_HOST;
1079 #ifdef CONFIG_IPV6_SUBTREES
1080 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1081 rt->rt6i_src.plen = cfg->fc_src_len;
1084 rt->rt6i_metric = cfg->fc_metric;
1086 /* We cannot add true routes via loopback here,
1087 they would result in kernel looping; promote them to reject routes
1089 if ((cfg->fc_flags & RTF_REJECT) ||
1090 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1091 /* hold loopback dev/idev if we haven't done so. */
1092 if (dev != &loopback_dev) {
1097 dev = &loopback_dev;
1099 idev = in6_dev_get(dev);
1105 rt->u.dst.output = ip6_pkt_discard_out;
1106 rt->u.dst.input = ip6_pkt_discard;
1107 rt->u.dst.error = -ENETUNREACH;
1108 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1112 if (cfg->fc_flags & RTF_GATEWAY) {
1113 struct in6_addr *gw_addr;
1116 gw_addr = &cfg->fc_gateway;
1117 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1118 gwa_type = ipv6_addr_type(gw_addr);
1120 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1121 struct rt6_info *grt;
1123 /* IPv6 strictly inhibits using not link-local
1124 addresses as nexthop address.
1125 Otherwise, router will not able to send redirects.
1126 It is very good, but in some (rare!) circumstances
1127 (SIT, PtP, NBMA NOARP links) it is handy to allow
1128 some exceptions. --ANK
1131 if (!(gwa_type&IPV6_ADDR_UNICAST))
1134 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1136 err = -EHOSTUNREACH;
1140 if (dev != grt->rt6i_dev) {
1141 dst_release(&grt->u.dst);
1145 dev = grt->rt6i_dev;
1146 idev = grt->rt6i_idev;
1148 in6_dev_hold(grt->rt6i_idev);
1150 if (!(grt->rt6i_flags&RTF_GATEWAY))
1152 dst_release(&grt->u.dst);
1158 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1166 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1167 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1168 if (IS_ERR(rt->rt6i_nexthop)) {
1169 err = PTR_ERR(rt->rt6i_nexthop);
1170 rt->rt6i_nexthop = NULL;
1175 rt->rt6i_flags = cfg->fc_flags;
1182 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1183 int type = nla->nla_type;
1186 if (type > RTAX_MAX) {
1191 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1196 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1197 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1198 if (!rt->u.dst.metrics[RTAX_MTU-1])
1199 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1200 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1201 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1202 rt->u.dst.dev = dev;
1203 rt->rt6i_idev = idev;
1204 rt->rt6i_table = table;
1205 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1213 dst_free((struct dst_entry *) rt);
/* Delete 'rt' from its table under the write lock, consuming the caller's
 * reference; the ip6_null_entry sentinel can never be deleted. */
1217 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1220 struct fib6_table *table;
1222 if (rt == &ip6_null_entry)
1225 table = rt->rt6i_table;
1226 write_lock_bh(&table->tb6_lock);
1228 err = fib6_del(rt, info);
1229 dst_release(&rt->u.dst);
1231 write_unlock_bh(&table->tb6_lock);
/* Delete a route with no netlink notification context. */
1236 int ip6_del_rt(struct rt6_info *rt)
1238 return __ip6_del_rt(rt, NULL);
/* Delete the first route in 'cfg's table matching the given prefix and,
 * when specified, interface, gateway and metric. */
1241 static int ip6_route_del(struct fib6_config *cfg)
1243 struct fib6_table *table;
1244 struct fib6_node *fn;
1245 struct rt6_info *rt;
1248 table = fib6_get_table(cfg->fc_table);
1252 read_lock_bh(&table->tb6_lock);
1254 fn = fib6_locate(&table->tb6_root,
1255 &cfg->fc_dst, cfg->fc_dst_len,
1256 &cfg->fc_src, cfg->fc_src_len);
1259 for (rt = fn->leaf; rt; rt = rt->u.next) {
1260 if (cfg->fc_ifindex &&
1261 (rt->rt6i_dev == NULL ||
1262 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1264 if (cfg->fc_flags & RTF_GATEWAY &&
1265 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1267 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1269 dst_hold(&rt->u.dst);
1270 read_unlock_bh(&table->tb6_lock);
1272 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1275 read_unlock_bh(&table->tb6_lock);
/* Flow key extended with the redirecting router's address, used by the
 * redirect lookup below. NOTE(review): the embedded 'struct flowi fl'
 * member is missing from this extract. */
1283 struct ip6rd_flowi {
1285 struct in6_addr gateway;
/* Validate an ICMPv6 redirect: find the current, unexpired gateway route
 * to the destination on the receiving interface whose gateway equals the
 * redirect's source (RFC 2461: redirects must come from the current
 * next hop). Returns a referenced route or ip6_null_entry. */
1288 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1292 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1293 struct rt6_info *rt;
1294 struct fib6_node *fn;
1297 * Get the "current" route for this destination and
1298 * check if the redirect has come from approriate router.
1300 * RFC 2461 specifies that redirects should only be
1301 * accepted if they come from the nexthop to the target.
1302 * Due to the way the routes are chosen, this notion
1303 * is a bit fuzzy and one might need to check all possible
1307 read_lock_bh(&table->tb6_lock);
1308 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1310 for (rt = fn->leaf; rt; rt = rt->u.next) {
1312 * Current route is on-link; redirect is always invalid.
1314 * Seems, previous statement is not true. It could
1315 * be node, which looks for us as on-link (f.e. proxy ndisc)
1316 * But then router serving it might decide, that we should
1317 * know truth 8)8) --ANK (980726).
1319 if (rt6_check_expired(rt))
1321 if (!(rt->rt6i_flags & RTF_GATEWAY))
1323 if (fl->oif != rt->rt6i_dev->ifindex)
1325 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1331 rt = &ip6_null_entry;
1332 BACKTRACK(&fl->fl6_src);
1334 dst_hold(&rt->u.dst);
1336 read_unlock_bh(&table->tb6_lock);
/* Build the extended redirect flow key and dispatch the validation lookup
 * through the policy-routing rules. */
1341 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1342 struct in6_addr *src,
1343 struct in6_addr *gateway,
1344 struct net_device *dev)
1346 struct ip6rd_flowi rdfl = {
1348 .oif = dev->ifindex,
1356 .gateway = *gateway,
1358 int flags = rt6_need_strict(dest) ? RT6_LOOKUP_F_IFACE : 0;
1360 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
/* Handle a validated ICMPv6 redirect: update the neighbour cache with the
 * new link-layer address, confirm the old path, and install a /128
 * RTF_CACHE route through the new next hop (expiring the old cached
 * clone), notifying netevent listeners of the switch. */
1363 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1364 struct in6_addr *saddr,
1365 struct neighbour *neigh, u8 *lladdr, int on_link)
1367 struct rt6_info *rt, *nrt = NULL;
1368 struct netevent_redirect netevent;
1370 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1372 if (rt == &ip6_null_entry) {
1373 if (net_ratelimit())
1374 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1375 "for redirect target\n");
1380 * We have finally decided to accept it.
1383 neigh_update(neigh, lladdr, NUD_STALE,
1384 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1385 NEIGH_UPDATE_F_OVERRIDE|
1386 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1387 NEIGH_UPDATE_F_ISROUTER))
1391 * Redirect received -> path was valid.
1392 * Look, redirects are sent only in response to data packets,
1393 * so that this nexthop apparently is reachable. --ANK
1395 dst_confirm(&rt->u.dst);
1397 /* Duplicate redirect: silently ignore. */
1398 if (neigh == rt->u.dst.neighbour)
1401 nrt = ip6_rt_copy(rt);
1405 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1407 nrt->rt6i_flags &= ~RTF_GATEWAY;
1409 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1410 nrt->rt6i_dst.plen = 128;
1411 nrt->u.dst.flags |= DST_HOST;
1413 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1414 nrt->rt6i_nexthop = neigh_clone(neigh);
1415 /* Reset pmtu, it may be better */
1416 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1417 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1419 if (ip6_ins_rt(nrt))
1422 netevent.old = &rt->u.dst;
1423 netevent.new = &nrt->u.dst;
1424 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1426 if (rt->rt6i_flags&RTF_CACHE) {
1432 dst_release(&rt->u.dst);
1437 * Handle ICMP "packet too big" messages
1438 * i.e. Path MTU discovery
/* Handle an ICMPv6 Packet Too Big: clamp the reported PMTU to IPV6_MIN_MTU
 * (setting ALLFRAG below the floor), update the cached host route in place
 * or COW/clone a new RTF_CACHE route carrying the lowered MTU, expiring
 * after ip6_rt_mtu_expires so PMTU increases can be rediscovered. */
1441 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1442 struct net_device *dev, u32 pmtu)
1444 struct rt6_info *rt, *nrt;
1447 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1451 if (pmtu >= dst_mtu(&rt->u.dst))
1454 if (pmtu < IPV6_MIN_MTU) {
1456 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1457 * MTU (1280) and a fragment header should always be included
1458 * after a node receiving Too Big message reporting PMTU is
1459 * less than the IPv6 Minimum Link MTU.
1461 pmtu = IPV6_MIN_MTU;
1465 /* New mtu received -> path was valid.
1466 They are sent only in response to data packets,
1467 so that this nexthop apparently is reachable. --ANK
1469 dst_confirm(&rt->u.dst);
1471 /* Host route. If it is static, it would be better
1472 not to override it, but add new one, so that
1473 when cache entry will expire old pmtu
1474 would return automatically.
1476 if (rt->rt6i_flags & RTF_CACHE) {
1477 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1479 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1480 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1481 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1486 Two cases are possible:
1487 1. It is connected route. Action: COW
1488 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1490 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1491 nrt = rt6_alloc_cow(rt, daddr, saddr);
1493 nrt = rt6_alloc_clone(rt, daddr);
1496 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1498 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1500 /* According to RFC 1981, detecting PMTU increase shouldn't be
1501 * happened within 5 mins, the recommended timer is 10 mins.
1502 * Here this route expiration time is set to ip6_rt_mtu_expires
1503 * which is 10 mins. After 10 mins the decreased pmtu is expired
1504 * and detecting PMTU increase will be automatically happened.
1506 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1507 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1512 dst_release(&rt->u.dst);
1516 * Misc support functions
/* Shallow-copy a route for cloning: duplicates handlers, metrics, device
 * and idev references, keys and flags (clearing RTF_EXPIRES), but resets
 * metric and expiry; next hop is left for the caller to set. */
1519 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1521 struct rt6_info *rt = ip6_dst_alloc();
1524 rt->u.dst.input = ort->u.dst.input;
1525 rt->u.dst.output = ort->u.dst.output;
1527 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1528 rt->u.dst.dev = ort->u.dst.dev;
1530 dev_hold(rt->u.dst.dev);
1531 rt->rt6i_idev = ort->rt6i_idev;
1533 in6_dev_hold(rt->rt6i_idev);
1534 rt->u.dst.lastuse = jiffies;
1535 rt->rt6i_expires = 0;
1537 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1538 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1539 rt->rt6i_metric = 0;
1541 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1542 #ifdef CONFIG_IPV6_SUBTREES
1543 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1545 rt->rt6i_table = ort->rt6i_table;
1550 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RTF_ROUTEINFO gateway route for (prefix, gateway,
 * ifindex) in the RT6_TABLE_INFO table; returns a referenced route or
 * NULL. */
1551 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1552 struct in6_addr *gwaddr, int ifindex)
1554 struct fib6_node *fn;
1555 struct rt6_info *rt = NULL;
1556 struct fib6_table *table;
1558 table = fib6_get_table(RT6_TABLE_INFO);
1562 write_lock_bh(&table->tb6_lock);
1563 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1567 for (rt = fn->leaf; rt; rt = rt->u.next) {
1568 if (rt->rt6i_dev->ifindex != ifindex)
1570 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1572 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1574 dst_hold(&rt->u.dst);
1578 write_unlock_bh(&table->tb6_lock);
/* Add an RTF_ROUTEINFO route learned from an RA (prefixlen 0 becomes a
 * default route) and return a referenced pointer to it via re-lookup. */
1582 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1583 struct in6_addr *gwaddr, int ifindex,
1586 struct fib6_config cfg = {
1587 .fc_table = RT6_TABLE_INFO,
1589 .fc_ifindex = ifindex,
1590 .fc_dst_len = prefixlen,
1591 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1592 RTF_UP | RTF_PREF(pref),
1595 ipv6_addr_copy(&cfg.fc_dst, prefix);
1596 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1598 /* We should treat it as a default route if prefix length is 0. */
1600 cfg.fc_flags |= RTF_DEFAULT;
1602 ip6_route_add(&cfg);
1604 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-installed default route (RTF_ADDRCONF|RTF_DEFAULT, both
 * flags required) via gateway @addr on device @dev.  Returns it with a
 * dst reference held, or NULL if absent.
 */
1608 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1610 struct rt6_info *rt;
1611 struct fib6_table *table;
1613 table = fib6_get_table(RT6_TABLE_DFLT);
1617 write_lock_bh(&table->tb6_lock);
1618 for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1619 if (dev == rt->rt6i_dev &&
1620 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1621 ipv6_addr_equal(&rt->rt6i_gateway, addr))
/* hand the caller a reference on the matched route */
1625 dst_hold(&rt->u.dst);
1626 write_unlock_bh(&table->tb6_lock);
/*
 * Install a default route learnt from a Router Advertisement (an
 * expiring RTF_ADDRCONF route carrying the RA-given preference) and
 * return the new entry via rt6_get_dflt_router().
 */
1630 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1631 struct net_device *dev,
1634 struct fib6_config cfg = {
1635 .fc_table = RT6_TABLE_DFLT,
1637 .fc_ifindex = dev->ifindex,
1638 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1639 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1642 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
/* best effort: a failed add simply yields NULL from the lookup below */
1644 ip6_route_add(&cfg);
1646 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Drop every default router entry that was autoconfigured from Router
 * Advertisements.  For each match a dst reference is taken and
 * tb6_lock released first, so the entry can be removed without holding
 * the table lock across the deletion.
 */
1649 void rt6_purge_dflt_routers(void)
1651 struct rt6_info *rt;
1652 struct fib6_table *table;
1654 /* NOTE: Keep consistent with rt6_get_dflt_router */
1655 table = fib6_get_table(RT6_TABLE_DFLT);
1660 read_lock_bh(&table->tb6_lock);
1661 for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1662 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1663 dst_hold(&rt->u.dst);
/* release the lock before deleting the held entry */
1664 read_unlock_bh(&table->tb6_lock);
1669 read_unlock_bh(&table->tb6_lock);
1672 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1673 struct fib6_config *cfg)
1675 memset(cfg, 0, sizeof(*cfg));
1677 cfg->fc_table = RT6_TABLE_MAIN;
1678 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1679 cfg->fc_metric = rtmsg->rtmsg_metric;
1680 cfg->fc_expires = rtmsg->rtmsg_info;
1681 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1682 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1683 cfg->fc_flags = rtmsg->rtmsg_flags;
1685 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1686 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1687 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * SIOCADDRT/SIOCDELRT ioctl handler: requires CAP_NET_ADMIN, copies
 * the in6_rtmsg from userspace, converts it with
 * rtmsg_to_fib6_config() and performs the add or delete.
 */
1690 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1692 struct fib6_config cfg;
1693 struct in6_rtmsg rtmsg;
1697 case SIOCADDRT: /* Add a route */
1698 case SIOCDELRT: /* Delete a route */
1699 if (!capable(CAP_NET_ADMIN))
1701 err = copy_from_user(&rtmsg, arg,
1702 sizeof(struct in6_rtmsg));
1706 rtmsg_to_fib6_config(&rtmsg, &cfg);
1711 err = ip6_route_add(&cfg);
1714 err = ip6_route_del(&cfg);
1728 * Drop the packet on the floor
/*
 * dst "discard" input handler: account the drop (address errors for
 * unspecified/reserved destinations, no-route counters otherwise) and
 * send an ICMPv6 destination-unreachable / no-route back to the
 * sender.
 */
1731 static int ip6_pkt_discard(struct sk_buff *skb)
1733 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1734 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1735 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1737 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1738 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1743 static int ip6_pkt_discard_out(struct sk_buff *skb)
1745 skb->dev = skb->dst->dev;
1746 return ip6_pkt_discard(skb);
1750 * Allocate a dst for local (unicast / anycast) address.
/*
 * Build the host (/128) route backing a local unicast or anycast
 * address: a loopback-device dst wired to ip6_input/ip6_output,
 * flagged RTF_LOCAL or RTF_ANYCAST, destined for the local table.
 * Returns the route with one reference held, or ERR_PTR(-ENOMEM) on
 * allocation/neighbour failure.
 */
1753 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1754 const struct in6_addr *addr,
1757 struct rt6_info *rt = ip6_dst_alloc();
1760 return ERR_PTR(-ENOMEM);
1762 dev_hold(&loopback_dev);
1765 rt->u.dst.flags = DST_HOST;
1766 rt->u.dst.input = ip6_input;
1767 rt->u.dst.output = ip6_output;
1768 rt->rt6i_dev = &loopback_dev;
1769 rt->rt6i_idev = idev;
1770 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1771 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
/* -1 presumably means "use the default hop limit" - confirm */
1772 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1773 rt->u.dst.obsolete = -1;
1775 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1777 rt->rt6i_flags |= RTF_ANYCAST;
1779 rt->rt6i_flags |= RTF_LOCAL;
1780 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1781 if (rt->rt6i_nexthop == NULL) {
/* neighbour lookup failed: unwind the dst allocation */
1782 dst_free((struct dst_entry *) rt);
1783 return ERR_PTR(-ENOMEM);
1786 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1787 rt->rt6i_dst.plen = 128;
1788 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1790 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_clean_all() callback used on interface shutdown: matches every
 * route bound to the device passed in @arg (all devices when @arg is
 * NULL), never the null entry.  Matched routes are reported for
 * removal - confirm the return convention against fib6_clean_all().
 */
1795 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1797 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1798 rt != &ip6_null_entry) {
1799 RT6_TRACE("deleted by ifdown %p\n", rt);
/* Flush all routes referencing @dev when the interface goes down. */
1805 void rt6_ifdown(struct net_device *dev)
1807 fib6_clean_all(fib6_ifdown, 0, dev);
/* Cookie passed by rt6_mtu_change() to rt6_mtu_change_route(). */
1810 struct rt6_mtu_change_arg
1812 struct net_device *dev; /* device whose MTU changed */
/*
 * fib6_clean_all() callback: fold a device MTU change into the cached
 * per-route MTU and advertised MSS metrics, following the RFC 1981
 * reasoning spelled out in the comments below.
 */
1816 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1818 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1819 struct inet6_dev *idev;
1821 /* In IPv6 pmtu discovery is not optional,
1822 so that RTAX_MTU lock cannot disable it.
1823 We still use this lock to block changes
1824 caused by addrconf/ndisc.
1827 idev = __in6_dev_get(arg->dev);
1831 /* For administrative MTU increase, there is no way to discover
1832 IPv6 PMTU increase, so PMTU increase should be updated here.
1833 Since RFC 1981 doesn't include administrative MTU increase
1834 update PMTU increase is a MUST. (i.e. jumbo frame)
1837 If new MTU is less than route PMTU, this new MTU will be the
1838 lowest MTU in the path, update the route PMTU to reflect PMTU
1839 decreases; if new MTU is greater than route PMTU, and the
1840 old MTU is the lowest MTU in the path, update the route PMTU
1841 to reflect the increase. In this case if the other nodes' MTU
1842 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1845 if (rt->rt6i_dev == arg->dev &&
1846 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1847 (dst_mtu(&rt->u.dst) > arg->mtu ||
1848 (dst_mtu(&rt->u.dst) < arg->mtu &&
1849 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1850 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1851 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/* Propagate an MTU change on @dev to all affected cached routes. */
1855 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1857 struct rt6_mtu_change_arg arg = {
1862 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
/* Validation policy for RTM_{NEW,DEL,GET}ROUTE netlink attributes. */
1865 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1866 [RTA_GATEWAY] = { .minlen = sizeof(struct in6_addr) },
1867 [RTA_OIF] = { .type = NLA_U32 },
1868 [RTA_IIF] = { .type = NLA_U32 },
1869 [RTA_PRIORITY] = { .type = NLA_U32 },
1870 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Parse an RTM_{NEW,DEL}ROUTE netlink request into a fib6_config.
 * Prefix attributes are copied only up to the length implied by the
 * declared prefix length; RTA_TABLE, when present, overrides the
 * rtm_table header field.
 */
1873 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1874 struct fib6_config *cfg)
1877 struct nlattr *tb[RTA_MAX+1];
1880 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1885 rtm = nlmsg_data(nlh);
1886 memset(cfg, 0, sizeof(*cfg));
1888 cfg->fc_table = rtm->rtm_table;
1889 cfg->fc_dst_len = rtm->rtm_dst_len;
1890 cfg->fc_src_len = rtm->rtm_src_len;
1891 cfg->fc_flags = RTF_UP;
1892 cfg->fc_protocol = rtm->rtm_protocol;
1894 if (rtm->rtm_type == RTN_UNREACHABLE)
1895 cfg->fc_flags |= RTF_REJECT;
/* remember the requester so notifications can echo pid/seq */
1897 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1898 cfg->fc_nlinfo.nlh = nlh;
1900 if (tb[RTA_GATEWAY]) {
1901 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1902 cfg->fc_flags |= RTF_GATEWAY;
/* number of bytes covering the destination prefix */
1906 int plen = (rtm->rtm_dst_len + 7) >> 3;
1908 if (nla_len(tb[RTA_DST]) < plen)
1911 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
/* number of bytes covering the source prefix */
1915 int plen = (rtm->rtm_src_len + 7) >> 3;
1917 if (nla_len(tb[RTA_SRC]) < plen)
1920 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1924 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1926 if (tb[RTA_PRIORITY])
1927 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1929 if (tb[RTA_METRICS]) {
1930 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1931 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1935 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the request and delete the route. */
1942 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1944 struct fib6_config cfg;
1947 err = rtm_to_fib6_config(skb, nlh, &cfg);
1951 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the request and add the route. */
1954 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1956 struct fib6_config cfg;
1959 err = rtm_to_fib6_config(skb, nlh, &cfg);
1963 return ip6_route_add(&cfg);
/*
 * Encode @rt as a netlink route message of @type into @skb.  When
 * @dst/@src are given they override the route's own keys (full /128
 * entries, as used for cloned-route answers); @prefix restricts output
 * to RTF_PREFIX_RT routes.  Ends the message on success, cancels it on
 * attribute overflow.
 */
1966 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1967 struct in6_addr *dst, struct in6_addr *src,
1968 int iif, int type, u32 pid, u32 seq,
1969 int prefix, unsigned int flags)
1972 struct nlmsghdr *nlh;
1973 struct rta_cacheinfo ci;
1976 if (prefix) { /* user wants prefix routes only */
1977 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1978 /* success since this is not a prefix route */
1983 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1987 rtm = nlmsg_data(nlh);
1988 rtm->rtm_family = AF_INET6;
1989 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1990 rtm->rtm_src_len = rt->rt6i_src.plen;
1993 table = rt->rt6i_table->tb6_id;
1995 table = RT6_TABLE_UNSPEC;
1996 rtm->rtm_table = table;
1997 NLA_PUT_U32(skb, RTA_TABLE, table);
1998 if (rt->rt6i_flags&RTF_REJECT)
1999 rtm->rtm_type = RTN_UNREACHABLE;
2000 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2001 rtm->rtm_type = RTN_LOCAL;
2003 rtm->rtm_type = RTN_UNICAST;
2005 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2006 rtm->rtm_protocol = rt->rt6i_protocol;
/* map internal flag bits onto the closest routing protocol origin */
2007 if (rt->rt6i_flags&RTF_DYNAMIC)
2008 rtm->rtm_protocol = RTPROT_REDIRECT;
2009 else if (rt->rt6i_flags & RTF_ADDRCONF)
2010 rtm->rtm_protocol = RTPROT_KERNEL;
2011 else if (rt->rt6i_flags&RTF_DEFAULT)
2012 rtm->rtm_protocol = RTPROT_RA;
2014 if (rt->rt6i_flags&RTF_CACHE)
2015 rtm->rtm_flags |= RTM_F_CLONED;
2018 NLA_PUT(skb, RTA_DST, 16, dst);
2019 rtm->rtm_dst_len = 128;
2020 } else if (rtm->rtm_dst_len)
2021 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2022 #ifdef CONFIG_IPV6_SUBTREES
2024 NLA_PUT(skb, RTA_SRC, 16, src);
2025 rtm->rtm_src_len = 128;
2026 } else if (rtm->rtm_src_len)
2027 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2030 NLA_PUT_U32(skb, RTA_IIF, iif);
2032 struct in6_addr saddr_buf;
2033 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2034 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2037 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2038 goto nla_put_failure;
2040 if (rt->u.dst.neighbour)
2041 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2044 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2046 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* cache info: ages/expiries reported in clock ticks relative to now */
2047 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2048 if (rt->rt6i_expires)
2049 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2052 ci.rta_used = rt->u.dst.__use;
2053 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2054 ci.rta_error = rt->u.dst.error;
2058 NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2060 return nlmsg_end(skb, nlh);
2063 return nlmsg_cancel(skb, nlh);
/*
 * Dump callback: emit one route into the dump skb as an RTM_NEWROUTE
 * multipart message.  Honors the RTM_F_PREFIX flag from the request to
 * restrict output to prefix routes.
 */
2066 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2068 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2071 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2072 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2073 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2077 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2078 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2079 prefix, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: build a flow from the request's src/dst/iif/
 * oif attributes, resolve it through ip6_route_output(), encode the
 * result with rt6_fill_node() and unicast the answer back to the
 * requester.
 */
2082 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2084 struct nlattr *tb[RTA_MAX+1];
2085 struct rt6_info *rt;
2086 struct sk_buff *skb;
2091 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2096 memset(&fl, 0, sizeof(fl));
2099 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2102 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2106 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2109 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2113 iif = nla_get_u32(tb[RTA_IIF]);
2116 fl.oif = nla_get_u32(tb[RTA_OIF]);
/* an input interface was named: make sure it actually exists */
2119 struct net_device *dev;
2120 dev = __dev_get_by_index(iif);
2127 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2133 /* Reserve room for dummy headers, this skb can pass
2134 through good chunk of routing engine.
2136 skb->mac.raw = skb->data;
2137 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2139 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2140 skb->dst = &rt->u.dst;
2142 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2143 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2144 nlh->nlmsg_seq, 0, 0);
2150 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt on the
 * RTNLGRP_IPV6_ROUTE group, echoing the pid/seq of the triggering
 * request when @info carries one.  On failure the group error state is
 * recorded via rtnl_set_sk_err().
 */
2155 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2157 struct sk_buff *skb;
2158 u32 pid = 0, seq = 0;
2159 struct nlmsghdr *nlh = NULL;
/* 256 bytes of slack for the route attributes */
2160 int payload = sizeof(struct rtmsg) + 256;
2167 seq = nlh->nlmsg_seq;
2170 skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2174 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2180 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2183 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2190 #ifdef CONFIG_PROC_FS
/* Fixed width of one /proc/net/ipv6_route line, in bytes. */
2192 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * /proc/net/ipv6_route formatter: append one RT6_INFO_LEN-wide line
 * per route (dst, src and next hop as hex, then metric, refcnt, use
 * count, flags and device name), skipping entries before the caller's
 * read offset.
 */
2203 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2205 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
/* honour the procfs read offset in units of whole lines */
2208 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2213 if (arg->len >= arg->length)
2216 for (i=0; i<16; i++) {
2217 sprintf(arg->buffer + arg->len, "%02x",
2218 rt->rt6i_dst.addr.s6_addr[i]);
2221 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2224 #ifdef CONFIG_IPV6_SUBTREES
2225 for (i=0; i<16; i++) {
2226 sprintf(arg->buffer + arg->len, "%02x",
2227 rt->rt6i_src.addr.s6_addr[i]);
2230 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
/* no subtree support: print an all-zero source column instead */
2233 sprintf(arg->buffer + arg->len,
2234 "00000000000000000000000000000000 00 ");
2238 if (rt->rt6i_nexthop) {
2239 for (i=0; i<16; i++) {
2240 sprintf(arg->buffer + arg->len, "%02x",
2241 rt->rt6i_nexthop->primary_key[i]);
2245 sprintf(arg->buffer + arg->len,
2246 "00000000000000000000000000000000");
2249 arg->len += sprintf(arg->buffer + arg->len,
2250 " %08x %08x %08x %08x %8s\n",
2251 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2252 rt->u.dst.__use, rt->rt6i_flags,
2253 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * Legacy procfs read callback for /proc/net/ipv6_route: walk the whole
 * FIB through rt6_info_route() and return the slice selected by
 * offset/length.
 */
2257 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2259 struct rt6_proc_arg arg = {
2265 fib6_clean_all(rt6_info_route, 0, &arg);
/* trim to the partial first line implied by a mid-line offset */
2269 *start += offset % RT6_INFO_LEN;
2271 arg.len -= offset % RT6_INFO_LEN;
2273 if (arg.len > length)
/* Emit the global rt6 statistics line for /proc/net/rt6_stats. */
2281 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2283 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2284 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2285 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2286 rt6_stats.fib_rt_cache,
2287 atomic_read(&ip6_dst_ops.entries),
2288 rt6_stats.fib_discarded_routes);
/* seq_file open hook for /proc/net/rt6_stats (single-record file). */
2293 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2295 return single_open(file, rt6_stats_seq_show, NULL);
/* File operations backing /proc/net/rt6_stats. */
2298 static struct file_operations rt6_stats_seq_fops = {
2299 .owner = THIS_MODULE,
2300 .open = rt6_stats_seq_open,
2302 .llseek = seq_lseek,
2303 .release = single_release,
2305 #endif /* CONFIG_PROC_FS */
2307 #ifdef CONFIG_SYSCTL
/* Scratch value written through the "flush" sysctl below. */
2309 static int flush_delay;
/*
 * "flush" sysctl handler: store the written value, then run FIB
 * garbage collection with it as the timeout; values <= 0 pass ~0UL
 * (presumably "expire everything now" - confirm against fib6_run_gc).
 */
2312 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2313 void __user *buffer, size_t *lenp, loff_t *ppos)
2316 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2317 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/* /proc/sys/net/ipv6/route/ tuning knobs (jiffies-valued entries use
 * the jiffies conversion handlers; *_ms variants expose milliseconds). */
2323 ctl_table ipv6_route_table[] = {
2325 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2326 .procname = "flush",
2327 .data = &flush_delay,
2328 .maxlen = sizeof(int),
2330 .proc_handler = &ipv6_sysctl_rtcache_flush
2333 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2334 .procname = "gc_thresh",
2335 .data = &ip6_dst_ops.gc_thresh,
2336 .maxlen = sizeof(int),
2338 .proc_handler = &proc_dointvec,
2341 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2342 .procname = "max_size",
2343 .data = &ip6_rt_max_size,
2344 .maxlen = sizeof(int),
2346 .proc_handler = &proc_dointvec,
2349 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2350 .procname = "gc_min_interval",
2351 .data = &ip6_rt_gc_min_interval,
2352 .maxlen = sizeof(int),
2354 .proc_handler = &proc_dointvec_jiffies,
2355 .strategy = &sysctl_jiffies,
2358 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2359 .procname = "gc_timeout",
2360 .data = &ip6_rt_gc_timeout,
2361 .maxlen = sizeof(int),
2363 .proc_handler = &proc_dointvec_jiffies,
2364 .strategy = &sysctl_jiffies,
2367 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2368 .procname = "gc_interval",
2369 .data = &ip6_rt_gc_interval,
2370 .maxlen = sizeof(int),
2372 .proc_handler = &proc_dointvec_jiffies,
2373 .strategy = &sysctl_jiffies,
2376 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2377 .procname = "gc_elasticity",
2378 .data = &ip6_rt_gc_elasticity,
2379 .maxlen = sizeof(int),
2381 .proc_handler = &proc_dointvec_jiffies,
2382 .strategy = &sysctl_jiffies,
2385 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2386 .procname = "mtu_expires",
2387 .data = &ip6_rt_mtu_expires,
2388 .maxlen = sizeof(int),
2390 .proc_handler = &proc_dointvec_jiffies,
2391 .strategy = &sysctl_jiffies,
2394 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2395 .procname = "min_adv_mss",
2396 .data = &ip6_rt_min_advmss,
2397 .maxlen = sizeof(int),
2399 .proc_handler = &proc_dointvec_jiffies,
2400 .strategy = &sysctl_jiffies,
2403 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2404 .procname = "gc_min_interval_ms",
2405 .data = &ip6_rt_gc_min_interval,
2406 .maxlen = sizeof(int),
2408 .proc_handler = &proc_dointvec_ms_jiffies,
2409 .strategy = &sysctl_ms_jiffies,
/*
 * Boot-time initialization of the IPv6 routing layer: create the
 * rt6_info slab cache (panicking on failure, since routing cannot work
 * without it) and register the procfs entries.
 */
2416 void __init ip6_route_init(void)
2418 struct proc_dir_entry *p;
2420 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2421 sizeof(struct rt6_info),
2422 0, SLAB_HWCACHE_ALIGN,
2424 if (!ip6_dst_ops.kmem_cachep)
2425 panic("cannot create ip6_dst_cache");
2428 #ifdef CONFIG_PROC_FS
2429 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2431 p->owner = THIS_MODULE;
2433 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2438 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * Teardown path mirroring ip6_route_init(): remove policy-routing
 * rules, the procfs entries and finally the rt6_info slab cache.
 */
2443 void ip6_route_cleanup(void)
2445 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2446 fib6_rules_cleanup();
2448 #ifdef CONFIG_PROC_FS
2449 proc_net_remove("ipv6_route");
2450 proc_net_remove("rt6_stats");
2457 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);