]> pilppa.org Git - linux-2.6-omap-h63xx.git/blob - net/ipv6/route.c
[IPV6]: Multiple Routing Tables
[linux-2.6-omap-h63xx.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40
41 #ifdef  CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
75 #define CLONE_OFFLINK_ROUTE 0
76
77 #define RT6_SELECT_F_IFACE      0x1
78 #define RT6_SELECT_F_REACHABLE  0x2
79
/*
 * Routing cache / garbage-collection tunables.  The values written in terms
 * of HZ are durations in jiffies.  ip6_rt_gc_interval is the only one that
 * is not static.
 */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(). */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
/*
 * dst_ops table for IPv6 routes: wires this file's static callbacks into
 * the destination cache.  entry_size is sizeof(struct rt6_info), matching
 * the cast done in ip6_dst_alloc().
 */
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
/*
 * Statically allocated "no route" entry.  The lookup helpers in this file
 * return its address when nothing usable is found.  It is a reject route
 * (RTF_REJECT | RTF_NONEXTHOP) bound to the loopback device, reports
 * -ENETUNREACH, and discards packets on both input and output.
 */
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
                        /* the entry is its own path */
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 /* allocate dst with ip6_dst_ops */
144 static __inline__ struct rt6_info *ip6_dst_alloc(void)
145 {
146         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
147 }
148
149 static void ip6_dst_destroy(struct dst_entry *dst)
150 {
151         struct rt6_info *rt = (struct rt6_info *)dst;
152         struct inet6_dev *idev = rt->rt6i_idev;
153
154         if (idev != NULL) {
155                 rt->rt6i_idev = NULL;
156                 in6_dev_put(idev);
157         }       
158 }
159
160 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
161                            int how)
162 {
163         struct rt6_info *rt = (struct rt6_info *)dst;
164         struct inet6_dev *idev = rt->rt6i_idev;
165
166         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
167                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
168                 if (loopback_idev != NULL) {
169                         rt->rt6i_idev = loopback_idev;
170                         in6_dev_put(idev);
171                 }
172         }
173 }
174
175 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
176 {
177         return (rt->rt6i_flags & RTF_EXPIRES &&
178                 time_after(jiffies, rt->rt6i_expires));
179 }
180
181 static inline int rt6_need_strict(struct in6_addr *daddr)
182 {
183         return (ipv6_addr_type(daddr) &
184                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
185 }
186
187 /*
188  *      Route lookup. Any table->tb6_lock is implied.
189  */
190
/*
 * rt6_device_match - choose, from the sibling list starting at @rt, the
 * route that fits outgoing interface @oif.
 *
 * With @oif == 0 the head @rt is returned unchanged.  Otherwise the list is
 * walked through u.next:
 *   - a route whose device ifindex equals @oif is returned at once;
 *   - a route on a loopback device may be remembered in 'local' as a
 *     fallback.
 * After the walk the remembered loopback route is returned if there is one.
 * Failing that, a strict lookup yields &ip6_null_entry and a non-strict one
 * returns the head @rt.
 */
191 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
192                                                     int oif,
193                                                     int strict)
194 {
195         struct rt6_info *local = NULL;
196         struct rt6_info *sprt;
197
198         if (oif) {
199                 for (sprt = rt; sprt; sprt = sprt->u.next) {
200                         struct net_device *dev = sprt->rt6i_dev;
201                         if (dev->ifindex == oif)
202                                 return sprt;
203                         if (dev->flags & IFF_LOOPBACK) {
                                /* loopback route whose inet6_dev is absent or on another interface */
204                                 if (sprt->rt6i_idev == NULL ||
205                                     sprt->rt6i_idev->dev->ifindex != oif) {
                                        /* oif is already known non-zero here, so this is just 'strict' */
206                                         if (strict && oif)
207                                                 continue;
                                        /*
                                         * Keep an earlier fallback whose inet6_dev does match oif.
                                         * NOTE(review): 'local' may have been stored with a NULL
                                         * rt6i_idev (non-strict case above); it is dereferenced
                                         * here without a check -- confirm this cannot happen.
                                         */
208                                         if (local && (!oif || 
209                                                       local->rt6i_idev->dev->ifindex == oif))
210                                                 continue;
211                                 }
212                                 local = sprt;
213                         }
214                 }
215
216                 if (local)
217                         return local;
218
219                 if (strict)
220                         return &ip6_null_entry;
221         }
222         return rt;
223 }
224
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for a route's next hop.
 *
 * Does nothing when @rt is NULL, has no nexthop neighbour, or the neighbour
 * is already in a NUD_VALID state.  Otherwise, if at least
 * rtr_probe_interval jiffies have passed since neigh->updated, the
 * timestamp is refreshed and a Neighbour Solicitation is sent to the
 * solicited-node multicast address of the neighbour.
 *
 * neigh->updated is modified here, so neigh->lock is taken for writing:
 * holding only the read side would let concurrent callers race on the
 * timestamp and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        /* re-check under the lock; the unlocked test above is only a fast path */
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                write_unlock_bh(&neigh->lock);

                /* the solicitation itself is sent after the lock is dropped */
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else
                write_unlock_bh(&neigh->lock);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
260
261 /*
262  * Default Router Selection (RFC 2461 6.3.6)
263  */
264 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
265 {
266         struct net_device *dev = rt->rt6i_dev;
267         if (!oif || dev->ifindex == oif)
268                 return 2;
269         if ((dev->flags & IFF_LOOPBACK) &&
270             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
271                 return 1;
272         return 0;
273 }
274
275 static int inline rt6_check_neigh(struct rt6_info *rt)
276 {
277         struct neighbour *neigh = rt->rt6i_nexthop;
278         int m = 0;
279         if (rt->rt6i_flags & RTF_NONEXTHOP ||
280             !(rt->rt6i_flags & RTF_GATEWAY))
281                 m = 1;
282         else if (neigh) {
283                 read_lock_bh(&neigh->lock);
284                 if (neigh->nud_state & NUD_VALID)
285                         m = 2;
286                 read_unlock_bh(&neigh->lock);
287         }
288         return m;
289 }
290
291 static int rt6_score_route(struct rt6_info *rt, int oif,
292                            int strict)
293 {
294         int m, n;
295                 
296         m = rt6_check_dev(rt, oif);
297         if (!m && (strict & RT6_SELECT_F_IFACE))
298                 return -1;
299 #ifdef CONFIG_IPV6_ROUTER_PREF
300         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
301 #endif
302         n = rt6_check_neigh(rt);
303         if (n > 1)
304                 m |= 16;
305         else if (!n && strict & RT6_SELECT_F_REACHABLE)
306                 return -1;
307         return m;
308 }
309
/*
 * rt6_select - pick the best route among the entries at the front of the
 * list *@head that share the head's metric.
 *
 * Each non-expired candidate is scored with rt6_score_route(); the highest
 * score wins.  Candidates that lose are handed to rt6_probe().
 *
 * If nothing matched, RT6_SELECT_F_REACHABLE was requested, and the last
 * non-expired candidate is not the head itself, the head is moved behind
 * that candidate so the next call starts with a different router
 * (round-robin).
 *
 * Returns the winning route, or &ip6_null_entry when there is none.
 *
 * NOTE(review): *head is dereferenced in the declarations, so the
 * "head ? *head : NULL" test in the trace call cannot protect against a
 * NULL @head; rt0 is likewise dereferenced for its metric before the loop
 * tests rt for NULL.
 */
310 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
311                                    int strict)
312 {
313         struct rt6_info *match = NULL, *last = NULL;
314         struct rt6_info *rt, *rt0 = *head;
315         u32 metric;
316         int mpri = -1;
317
318         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
319                   __FUNCTION__, head, head ? *head : NULL, oif);
320
        /*
         * Walk while the metric equals the head's.  The (!last || rt != rt0)
         * term stops the walk if it arrives back at rt0 after at least one
         * non-expired entry was seen.
         */
321         for (rt = rt0, metric = rt0->rt6i_metric;
322              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
323              rt = rt->u.next) {
324                 int m;
325
326                 if (rt6_check_expired(rt))
327                         continue;
328
                /* 'last' tracks the most recent non-expired candidate */
329                 last = rt;
330
331                 m = rt6_score_route(rt, oif, strict);
332                 if (m < 0)
333                         continue;
334
335                 if (m > mpri) {
                        /* the previous best (possibly NULL) is demoted and probed */
336                         rt6_probe(match);
337                         match = rt;
338                         mpri = m;
339                 } else {
340                         rt6_probe(rt);
341                 }
342         }
343
344         if (!match &&
345             (strict & RT6_SELECT_F_REACHABLE) &&
346             last && last != rt0) {
347                 /* no entries matched; do round-robin */
                /* function-local lock serialising the three pointer updates below */
348                 static DEFINE_SPINLOCK(lock);
349                 spin_lock(&lock);
350                 *head = rt0->u.next;
351                 rt0->u.next = last->u.next;
352                 last->u.next = rt0;
353                 spin_unlock(&lock);
354         }
355
356         RT6_TRACE("%s() => %p, score=%d\n",
357                   __FUNCTION__, match, mpri);
358
359         return (match ? match : &ip6_null_entry);
360 }
361
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a Route Information option.
 * @dev:    device the option was received on (its ifindex keys the route)
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the router that sent the option
 *
 * Validates the option, then adds, refreshes or deletes the matching
 * route-info route:
 *   - lifetime 0 deletes an existing route;
 *   - a non-zero lifetime creates the route if missing, otherwise updates
 *     its preference bits;
 *   - lifetime 0xffffffff means "never expires", anything else sets
 *     RTF_EXPIRES with an expiry of jiffies + HZ * lifetime.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                /* more than 64 prefix bits need at least two length units */
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        /* the field arrives in network byte order: convert network -> host */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3)
                /* option carries a full 128-bit prefix: use it in place */
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        if (rt && !lifetime) {
                ip6_del_rt(rt, NULL, NULL, NULL);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                /* presumably balances a reference taken by the get/add helper -- confirm */
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
439
/*
 * BACKTRACK() - after a strict lookup produced ip6_null_entry, climb the
 * fib6 tree through fn->parent.
 *
 * Expects `rt`, `fn` and `flags` to be in scope at the call site, together
 * with the labels `out` and `restart`:
 *   - on reaching a node flagged RTN_TL_ROOT a reference is taken on rt and
 *     control jumps to `out`;
 *   - on reaching a node flagged RTN_RTINFO control jumps to `restart` so
 *     the caller re-runs its selection on that node.
 * If the parent chain ends without either, execution falls through.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single statement
 * and cannot capture a following else.
 */
#define BACKTRACK() \
do { \
        if (rt == &ip6_null_entry && (flags & RT6_F_STRICT)) { \
                while ((fn = fn->parent) != NULL) { \
                        if (fn->fn_flags & RTN_TL_ROOT) { \
                                dst_hold(&rt->u.dst); \
                                goto out; \
                        } \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
451
452 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
453                                              struct flowi *fl, int flags)
454 {
455         struct fib6_node *fn;
456         struct rt6_info *rt;
457
458         read_lock_bh(&table->tb6_lock);
459         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
460 restart:
461         rt = fn->leaf;
462         rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
463         BACKTRACK();
464         dst_hold(&rt->u.dst);
465 out:
466         read_unlock_bh(&table->tb6_lock);
467
468         rt->u.dst.lastuse = jiffies;
469         rt->u.dst.__use++;
470
471         return rt;
472
473 }
474
475 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
476                             int oif, int strict)
477 {
478         struct flowi fl = {
479                 .oif = oif,
480                 .nl_u = {
481                         .ip6_u = {
482                                 .daddr = *daddr,
483                                 /* TODO: saddr */
484                         },
485                 },
486         };
487         struct dst_entry *dst;
488         int flags = strict ? RT6_F_STRICT : 0;
489
490         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
491         if (dst->error == 0)
492                 return (struct rt6_info *) dst;
493
494         dst_release(dst);
495
496         return NULL;
497 }
498
499 /* ip6_ins_rt is called with FREE table->tb6_lock.
500    It takes new route entry, the addition fails by any reason the
501    route is freed. In any case, if caller does not hold it, it may
502    be destroyed.
503  */
504
505 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
506                 void *_rtattr, struct netlink_skb_parms *req)
507 {
508         int err;
509         struct fib6_table *table;
510
511         table = rt->rt6i_table;
512         write_lock_bh(&table->tb6_lock);
513         err = fib6_add(&table->tb6_root, rt, nlh, _rtattr, req);
514         write_unlock_bh(&table->tb6_lock);
515
516         return err;
517 }
518
519 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
520                                       struct in6_addr *saddr)
521 {
522         struct rt6_info *rt;
523
524         /*
525          *      Clone the route.
526          */
527
528         rt = ip6_rt_copy(ort);
529
530         if (rt) {
531                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
532                         if (rt->rt6i_dst.plen != 128 &&
533                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
534                                 rt->rt6i_flags |= RTF_ANYCAST;
535                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
536                 }
537
538                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
539                 rt->rt6i_dst.plen = 128;
540                 rt->rt6i_flags |= RTF_CACHE;
541                 rt->u.dst.flags |= DST_HOST;
542
543 #ifdef CONFIG_IPV6_SUBTREES
544                 if (rt->rt6i_src.plen && saddr) {
545                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
546                         rt->rt6i_src.plen = 128;
547                 }
548 #endif
549
550                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
551
552         }
553
554         return rt;
555 }
556
557 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
558 {
559         struct rt6_info *rt = ip6_rt_copy(ort);
560         if (rt) {
561                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
562                 rt->rt6i_dst.plen = 128;
563                 rt->rt6i_flags |= RTF_CACHE;
564                 if (rt->rt6i_flags & RTF_REJECT)
565                         rt->u.dst.error = ort->u.dst.error;
566                 rt->u.dst.flags |= DST_HOST;
567                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
568         }
569         return rt;
570 }
571
/*
 * ip6_pol_route_input - per-table route resolution for received packets;
 * ip6_route_input() passes it to fib6_rule_lookup().
 *
 * Looks up the flow's destination/source in @table and selects among the
 * node's routes with rt6_select(), keyed on the incoming interface fl->iif.
 * The first pass asks for reachable routers only (RT6_SELECT_F_REACHABLE).
 * Whenever that pass ends at the "out" label, the lookup is redone once
 * with the restriction cleared.
 *
 * A selected route that is not a cache entry and has neither a nexthop nor
 * RTF_NONEXTHOP is copied per destination with rt6_alloc_cow() and inserted
 * with ip6_ins_rt().  If the copy or the insertion fails, the whole lookup
 * is retried until 'attempts' runs out.  With CLONE_OFFLINK_ROUTE == 0 all
 * other routes are returned as they are.
 *
 * Returns a route with a reference held and lastuse/__use updated.  It is
 * &ip6_null_entry when nothing usable was found.
 *
 * NOTE(review): BACKTRACK() takes a reference before jumping to "out", and
 * "out" takes one as well (or restarts the lookup) -- confirm the extra
 * reference on ip6_null_entry is intended.
 */
572 struct rt6_info *ip6_pol_route_input(struct fib6_table *table, struct flowi *fl,
573                                      int flags)
574 {
575         struct fib6_node *fn;
576         struct rt6_info *rt, *nrt;
577         int strict = 0;
578         int attempts = 3;
579         int err;
580         int reachable = RT6_SELECT_F_REACHABLE;
581
582         if (flags & RT6_F_STRICT)
583                 strict = RT6_SELECT_F_IFACE;
584
585 relookup:
586         read_lock_bh(&table->tb6_lock);
587
588 restart_2:
589         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
590
591 restart:
592         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
593         BACKTRACK();
594         if (rt == &ip6_null_entry ||
595             rt->rt6i_flags & RTF_CACHE)
596                 goto out;
597
        /* pin the route, then drop the table lock before allocating */
598         dst_hold(&rt->u.dst);
599         read_unlock_bh(&table->tb6_lock);
600
601         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
602                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
603         else {
604 #if CLONE_OFFLINK_ROUTE
605                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
606 #else
                /* return the selected route itself; the reference taken above is kept */
607                 goto out2;
608 #endif
609         }
610
        /* switch from the original route to the copy (or to the null entry if the copy failed) */
611         dst_release(&rt->u.dst);
612         rt = nrt ? : &ip6_null_entry;
613
614         dst_hold(&rt->u.dst);
615         if (nrt) {
616                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
617                 if (!err)
618                         goto out2;
619         }
620
621         if (--attempts <= 0)
622                 goto out2;
623
624         /*
625          * Race condition! In the gap, when table->tb6_lock was
626          * released someone could insert this route.  Relookup.
627          */
628         dst_release(&rt->u.dst);
629         goto relookup;
630
631 out:
        /* first arrival here: redo the lookup once without the reachability requirement */
632         if (reachable) {
633                 reachable = 0;
634                 goto restart_2;
635         }
636         dst_hold(&rt->u.dst);
637         read_unlock_bh(&table->tb6_lock);
638 out2:
639         rt->u.dst.lastuse = jiffies;
640         rt->u.dst.__use++;
641
642         return rt;
643 }
644
645 void ip6_route_input(struct sk_buff *skb)
646 {
647         struct ipv6hdr *iph = skb->nh.ipv6h;
648         struct flowi fl = {
649                 .iif = skb->dev->ifindex,
650                 .nl_u = {
651                         .ip6_u = {
652                                 .daddr = iph->daddr,
653                                 .saddr = iph->saddr,
654                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
655                         },
656                 },
657                 .proto = iph->nexthdr,
658         };
659         int flags = 0;
660
661         if (rt6_need_strict(&iph->daddr))
662                 flags |= RT6_F_STRICT;
663
664         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
665 }
666
/*
 * ip6_pol_route_output - per-table route resolution for locally generated
 * packets; ip6_route_output() passes it to fib6_rule_lookup().
 *
 * Same algorithm as ip6_pol_route_input(), except that rt6_select() is
 * keyed on the outgoing interface fl->oif.  The first pass requires a
 * reachable router, and a pass ending at "out" is redone once without that
 * requirement.  Routes lacking a nexthop are copied per destination with
 * rt6_alloc_cow() and inserted with ip6_ins_rt().  If the copy or the
 * insertion fails, the lookup is retried until 'attempts' runs out.
 *
 * Returns a route with a reference held and lastuse/__use updated.  It is
 * &ip6_null_entry when nothing usable was found.
 *
 * NOTE(review): BACKTRACK() takes a reference before jumping to "out", and
 * "out" takes one as well (or restarts the lookup) -- confirm the extra
 * reference on ip6_null_entry is intended.
 */
667 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
668                                              struct flowi *fl, int flags)
669 {
670         struct fib6_node *fn;
671         struct rt6_info *rt, *nrt;
672         int strict = 0;
673         int attempts = 3;
674         int err;
675         int reachable = RT6_SELECT_F_REACHABLE;
676
677         if (flags & RT6_F_STRICT)
678                 strict = RT6_SELECT_F_IFACE;
679
680 relookup:
681         read_lock_bh(&table->tb6_lock);
682
683 restart_2:
684         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
685
686 restart:
687         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
688         BACKTRACK();
689         if (rt == &ip6_null_entry ||
690             rt->rt6i_flags & RTF_CACHE)
691                 goto out;
692
        /* pin the route, then drop the table lock before allocating */
693         dst_hold(&rt->u.dst);
694         read_unlock_bh(&table->tb6_lock);
695
696         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
697                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
698         else {
699 #if CLONE_OFFLINK_ROUTE
700                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
701 #else
                /* return the selected route itself; the reference taken above is kept */
702                 goto out2;
703 #endif
704         }
705
        /* switch from the original route to the copy (or to the null entry if the copy failed) */
706         dst_release(&rt->u.dst);
707         rt = nrt ? : &ip6_null_entry;
708
709         dst_hold(&rt->u.dst);
710         if (nrt) {
711                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
712                 if (!err)
713                         goto out2;
714         }
715
716         if (--attempts <= 0)
717                 goto out2;
718
719         /*
720          * Race condition! In the gap, when table->tb6_lock was
721          * released someone could insert this route.  Relookup.
722          */
723         dst_release(&rt->u.dst);
724         goto relookup;
725
726 out:
        /* first arrival here: redo the lookup once without the reachability requirement */
727         if (reachable) {
728                 reachable = 0;
729                 goto restart_2;
730         }
731         dst_hold(&rt->u.dst);
732         read_unlock_bh(&table->tb6_lock);
733 out2:
734         rt->u.dst.lastuse = jiffies;
735         rt->u.dst.__use++;
736         return rt;
737 }
738
739 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
740 {
741         int flags = 0;
742
743         if (rt6_need_strict(&fl->fl6_dst))
744                 flags |= RT6_F_STRICT;
745
746         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
747 }
748
749
750 /*
751  *      Destination cache support functions
752  */
753
754 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
755 {
756         struct rt6_info *rt;
757
758         rt = (struct rt6_info *) dst;
759
760         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
761                 return dst;
762
763         return NULL;
764 }
765
766 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
767 {
768         struct rt6_info *rt = (struct rt6_info *) dst;
769
770         if (rt) {
771                 if (rt->rt6i_flags & RTF_CACHE)
772                         ip6_del_rt(rt, NULL, NULL, NULL);
773                 else
774                         dst_release(dst);
775         }
776         return NULL;
777 }
778
779 static void ip6_link_failure(struct sk_buff *skb)
780 {
781         struct rt6_info *rt;
782
783         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
784
785         rt = (struct rt6_info *) skb->dst;
786         if (rt) {
787                 if (rt->rt6i_flags&RTF_CACHE) {
788                         dst_set_expires(&rt->u.dst, 0);
789                         rt->rt6i_flags |= RTF_EXPIRES;
790                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
791                         rt->rt6i_node->fn_sernum = -1;
792         }
793 }
794
795 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
796 {
797         struct rt6_info *rt6 = (struct rt6_info*)dst;
798
799         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
800                 rt6->rt6i_flags |= RTF_MODIFIED;
801                 if (mtu < IPV6_MIN_MTU) {
802                         mtu = IPV6_MIN_MTU;
803                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
804                 }
805                 dst->metrics[RTAX_MTU-1] = mtu;
806                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
807         }
808 }
809
810 static int ipv6_get_mtu(struct net_device *dev);
811
812 static inline unsigned int ipv6_advmss(unsigned int mtu)
813 {
814         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
815
816         if (mtu < ip6_rt_min_advmss)
817                 mtu = ip6_rt_min_advmss;
818
819         /*
820          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
821          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
822          * IPV6_MAXPLEN is also valid and means: "any MSS, 
823          * rely only on pmtu discovery"
824          */
825         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
826                 mtu = IPV6_MAXPLEN;
827         return mtu;
828 }
829
/*
 * List (linked through dst->next) of the entries created by
 * ndisc_dst_alloc() and reaped by ndisc_dst_gc().  Both functions take
 * ndisc_lock around every access to the list.
 */
830 static struct dst_entry *ndisc_dst_gc_list;
831 DEFINE_SPINLOCK(ndisc_lock);
832
833 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
834                                   struct neighbour *neigh,
835                                   struct in6_addr *addr,
836                                   int (*output)(struct sk_buff *))
837 {
838         struct rt6_info *rt;
839         struct inet6_dev *idev = in6_dev_get(dev);
840
841         if (unlikely(idev == NULL))
842                 return NULL;
843
844         rt = ip6_dst_alloc();
845         if (unlikely(rt == NULL)) {
846                 in6_dev_put(idev);
847                 goto out;
848         }
849
850         dev_hold(dev);
851         if (neigh)
852                 neigh_hold(neigh);
853         else
854                 neigh = ndisc_get_neigh(dev, addr);
855
856         rt->rt6i_dev      = dev;
857         rt->rt6i_idev     = idev;
858         rt->rt6i_nexthop  = neigh;
859         atomic_set(&rt->u.dst.__refcnt, 1);
860         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
861         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
862         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
863         rt->u.dst.output  = output;
864
865 #if 0   /* there's no chance to use these for ndisc */
866         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
867                                 ? DST_HOST 
868                                 : 0;
869         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
870         rt->rt6i_dst.plen = 128;
871 #endif
872
873         spin_lock_bh(&ndisc_lock);
874         rt->u.dst.next = ndisc_dst_gc_list;
875         ndisc_dst_gc_list = &rt->u.dst;
876         spin_unlock_bh(&ndisc_lock);
877
878         fib6_force_start_gc();
879
880 out:
881         return (struct dst_entry *)rt;
882 }
883
884 int ndisc_dst_gc(int *more)
885 {
886         struct dst_entry *dst, *next, **pprev;
887         int freed;
888
889         next = NULL;
890         freed = 0;
891
892         spin_lock_bh(&ndisc_lock);
893         pprev = &ndisc_dst_gc_list;
894
895         while ((dst = *pprev) != NULL) {
896                 if (!atomic_read(&dst->__refcnt)) {
897                         *pprev = dst->next;
898                         dst_free(dst);
899                         freed++;
900                 } else {
901                         pprev = &dst->next;
902                         (*more)++;
903                 }
904         }
905
906         spin_unlock_bh(&ndisc_lock);
907
908         return freed;
909 }
910
911 static int ip6_dst_gc(void)
912 {
913         static unsigned expire = 30*HZ;
914         static unsigned long last_gc;
915         unsigned long now = jiffies;
916
917         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
918             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
919                 goto out;
920
921         expire++;
922         fib6_run_gc(expire);
923         last_gc = now;
924         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
925                 expire = ip6_rt_gc_timeout>>1;
926
927 out:
928         expire -= expire>>ip6_rt_gc_elasticity;
929         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
930 }
931
932 /* Clean host part of a prefix. Not necessary in radix tree,
933    but results in cleaner routing tables.
934
935    Remove it only when all the things will work!
936  */
937
938 static int ipv6_get_mtu(struct net_device *dev)
939 {
940         int mtu = IPV6_MIN_MTU;
941         struct inet6_dev *idev;
942
943         idev = in6_dev_get(dev);
944         if (idev) {
945                 mtu = idev->cnf.mtu6;
946                 in6_dev_put(idev);
947         }
948         return mtu;
949 }
950
951 int ipv6_get_hoplimit(struct net_device *dev)
952 {
953         int hoplimit = ipv6_devconf.hop_limit;
954         struct inet6_dev *idev;
955
956         idev = in6_dev_get(dev);
957         if (idev) {
958                 hoplimit = idev->cnf.hop_limit;
959                 in6_dev_put(idev);
960         }
961         return hoplimit;
962 }
963
964 /*
965  *
966  */
967
968 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
969                   void *_rtattr, struct netlink_skb_parms *req,
970                   u32 table_id)
971 {
972         int err;
973         struct rtmsg *r;
974         struct rtattr **rta;
975         struct rt6_info *rt = NULL;
976         struct net_device *dev = NULL;
977         struct inet6_dev *idev = NULL;
978         struct fib6_table *table;
979         int addr_type;
980
981         rta = (struct rtattr **) _rtattr;
982
983         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
984                 return -EINVAL;
985 #ifndef CONFIG_IPV6_SUBTREES
986         if (rtmsg->rtmsg_src_len)
987                 return -EINVAL;
988 #endif
989         if (rtmsg->rtmsg_ifindex) {
990                 err = -ENODEV;
991                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
992                 if (!dev)
993                         goto out;
994                 idev = in6_dev_get(dev);
995                 if (!idev)
996                         goto out;
997         }
998
999         if (rtmsg->rtmsg_metric == 0)
1000                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
1001
1002         table = fib6_new_table(table_id);
1003         if (table == NULL) {
1004                 err = -ENOBUFS;
1005                 goto out;
1006         }
1007
1008         rt = ip6_dst_alloc();
1009
1010         if (rt == NULL) {
1011                 err = -ENOMEM;
1012                 goto out;
1013         }
1014
1015         rt->u.dst.obsolete = -1;
1016         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
1017         if (nlh && (r = NLMSG_DATA(nlh))) {
1018                 rt->rt6i_protocol = r->rtm_protocol;
1019         } else {
1020                 rt->rt6i_protocol = RTPROT_BOOT;
1021         }
1022
1023         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
1024
1025         if (addr_type & IPV6_ADDR_MULTICAST)
1026                 rt->u.dst.input = ip6_mc_input;
1027         else
1028                 rt->u.dst.input = ip6_forward;
1029
1030         rt->u.dst.output = ip6_output;
1031
1032         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
1033                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
1034         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
1035         if (rt->rt6i_dst.plen == 128)
1036                rt->u.dst.flags = DST_HOST;
1037
1038 #ifdef CONFIG_IPV6_SUBTREES
1039         ipv6_addr_prefix(&rt->rt6i_src.addr, 
1040                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1041         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
1042 #endif
1043
1044         rt->rt6i_metric = rtmsg->rtmsg_metric;
1045
1046         /* We cannot add true routes via loopback here,
1047            they would result in kernel looping; promote them to reject routes
1048          */
1049         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
1050             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1051                 /* hold loopback dev/idev if we haven't done so. */
1052                 if (dev != &loopback_dev) {
1053                         if (dev) {
1054                                 dev_put(dev);
1055                                 in6_dev_put(idev);
1056                         }
1057                         dev = &loopback_dev;
1058                         dev_hold(dev);
1059                         idev = in6_dev_get(dev);
1060                         if (!idev) {
1061                                 err = -ENODEV;
1062                                 goto out;
1063                         }
1064                 }
1065                 rt->u.dst.output = ip6_pkt_discard_out;
1066                 rt->u.dst.input = ip6_pkt_discard;
1067                 rt->u.dst.error = -ENETUNREACH;
1068                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1069                 goto install_route;
1070         }
1071
1072         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1073                 struct in6_addr *gw_addr;
1074                 int gwa_type;
1075
1076                 gw_addr = &rtmsg->rtmsg_gateway;
1077                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1078                 gwa_type = ipv6_addr_type(gw_addr);
1079
1080                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1081                         struct rt6_info *grt;
1082
1083                         /* IPv6 strictly inhibits using not link-local
1084                            addresses as nexthop address.
1085                            Otherwise, router will not able to send redirects.
1086                            It is very good, but in some (rare!) circumstances
1087                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1088                            some exceptions. --ANK
1089                          */
1090                         err = -EINVAL;
1091                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1092                                 goto out;
1093
1094                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1095
1096                         err = -EHOSTUNREACH;
1097                         if (grt == NULL)
1098                                 goto out;
1099                         if (dev) {
1100                                 if (dev != grt->rt6i_dev) {
1101                                         dst_release(&grt->u.dst);
1102                                         goto out;
1103                                 }
1104                         } else {
1105                                 dev = grt->rt6i_dev;
1106                                 idev = grt->rt6i_idev;
1107                                 dev_hold(dev);
1108                                 in6_dev_hold(grt->rt6i_idev);
1109                         }
1110                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1111                                 err = 0;
1112                         dst_release(&grt->u.dst);
1113
1114                         if (err)
1115                                 goto out;
1116                 }
1117                 err = -EINVAL;
1118                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1119                         goto out;
1120         }
1121
1122         err = -ENODEV;
1123         if (dev == NULL)
1124                 goto out;
1125
1126         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1127                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1128                 if (IS_ERR(rt->rt6i_nexthop)) {
1129                         err = PTR_ERR(rt->rt6i_nexthop);
1130                         rt->rt6i_nexthop = NULL;
1131                         goto out;
1132                 }
1133         }
1134
1135         rt->rt6i_flags = rtmsg->rtmsg_flags;
1136
1137 install_route:
1138         if (rta && rta[RTA_METRICS-1]) {
1139                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1140                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1141
1142                 while (RTA_OK(attr, attrlen)) {
1143                         unsigned flavor = attr->rta_type;
1144                         if (flavor) {
1145                                 if (flavor > RTAX_MAX) {
1146                                         err = -EINVAL;
1147                                         goto out;
1148                                 }
1149                                 rt->u.dst.metrics[flavor-1] =
1150                                         *(u32 *)RTA_DATA(attr);
1151                         }
1152                         attr = RTA_NEXT(attr, attrlen);
1153                 }
1154         }
1155
1156         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1157                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1158         if (!rt->u.dst.metrics[RTAX_MTU-1])
1159                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1160         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1161                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1162         rt->u.dst.dev = dev;
1163         rt->rt6i_idev = idev;
1164         rt->rt6i_table = table;
1165         return ip6_ins_rt(rt, nlh, _rtattr, req);
1166
1167 out:
1168         if (dev)
1169                 dev_put(dev);
1170         if (idev)
1171                 in6_dev_put(idev);
1172         if (rt)
1173                 dst_free((struct dst_entry *) rt);
1174         return err;
1175 }
1176
1177 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1178 {
1179         int err;
1180         struct fib6_table *table;
1181
1182         table = rt->rt6i_table;
1183         write_lock_bh(&table->tb6_lock);
1184
1185         err = fib6_del(rt, nlh, _rtattr, req);
1186         dst_release(&rt->u.dst);
1187
1188         write_unlock_bh(&table->tb6_lock);
1189
1190         return err;
1191 }
1192
1193 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
1194                          void *_rtattr, struct netlink_skb_parms *req,
1195                          u32 table_id)
1196 {
1197         struct fib6_table *table;
1198         struct fib6_node *fn;
1199         struct rt6_info *rt;
1200         int err = -ESRCH;
1201
1202         table = fib6_get_table(table_id);
1203         if (table == NULL)
1204                 return err;
1205
1206         read_lock_bh(&table->tb6_lock);
1207
1208         fn = fib6_locate(&table->tb6_root,
1209                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1210                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1211         
1212         if (fn) {
1213                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1214                         if (rtmsg->rtmsg_ifindex &&
1215                             (rt->rt6i_dev == NULL ||
1216                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1217                                 continue;
1218                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1219                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1220                                 continue;
1221                         if (rtmsg->rtmsg_metric &&
1222                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1223                                 continue;
1224                         dst_hold(&rt->u.dst);
1225                         read_unlock_bh(&table->tb6_lock);
1226
1227                         return ip6_del_rt(rt, nlh, _rtattr, req);
1228                 }
1229         }
1230         read_unlock_bh(&table->tb6_lock);
1231
1232         return err;
1233 }
1234
1235 /*
1236  *      Handle redirects
1237  */
1238 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1239                   struct neighbour *neigh, u8 *lladdr, int on_link)
1240 {
1241         struct rt6_info *rt, *nrt = NULL;
1242         struct fib6_node *fn;
1243         struct fib6_table *table;
1244         struct netevent_redirect netevent;
1245
1246         /* TODO: Very lazy, might need to check all tables */
1247         table = fib6_get_table(RT6_TABLE_MAIN);
1248         if (table == NULL)
1249                 return;
1250
1251         /*
1252          * Get the "current" route for this destination and
1253          * check if the redirect has come from approriate router.
1254          *
1255          * RFC 2461 specifies that redirects should only be
1256          * accepted if they come from the nexthop to the target.
1257          * Due to the way the routes are chosen, this notion
1258          * is a bit fuzzy and one might need to check all possible
1259          * routes.
1260          */
1261
1262         read_lock_bh(&table->tb6_lock);
1263         fn = fib6_lookup(&table->tb6_root, dest, NULL);
1264 restart:
1265         for (rt = fn->leaf; rt; rt = rt->u.next) {
1266                 /*
1267                  * Current route is on-link; redirect is always invalid.
1268                  *
1269                  * Seems, previous statement is not true. It could
1270                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1271                  * But then router serving it might decide, that we should
1272                  * know truth 8)8) --ANK (980726).
1273                  */
1274                 if (rt6_check_expired(rt))
1275                         continue;
1276                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1277                         continue;
1278                 if (neigh->dev != rt->rt6i_dev)
1279                         continue;
1280                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1281                         continue;
1282                 break;
1283         }
1284         if (rt)
1285                 dst_hold(&rt->u.dst);
1286         else if (rt6_need_strict(dest)) {
1287                 while ((fn = fn->parent) != NULL) {
1288                         if (fn->fn_flags & RTN_ROOT)
1289                                 break;
1290                         if (fn->fn_flags & RTN_RTINFO)
1291                                 goto restart;
1292                 }
1293         }
1294         read_unlock_bh(&table->tb6_lock);
1295
1296         if (!rt) {
1297                 if (net_ratelimit())
1298                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1299                                "for redirect target\n");
1300                 return;
1301         }
1302
1303         /*
1304          *      We have finally decided to accept it.
1305          */
1306
1307         neigh_update(neigh, lladdr, NUD_STALE, 
1308                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1309                      NEIGH_UPDATE_F_OVERRIDE|
1310                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1311                                      NEIGH_UPDATE_F_ISROUTER))
1312                      );
1313
1314         /*
1315          * Redirect received -> path was valid.
1316          * Look, redirects are sent only in response to data packets,
1317          * so that this nexthop apparently is reachable. --ANK
1318          */
1319         dst_confirm(&rt->u.dst);
1320
1321         /* Duplicate redirect: silently ignore. */
1322         if (neigh == rt->u.dst.neighbour)
1323                 goto out;
1324
1325         nrt = ip6_rt_copy(rt);
1326         if (nrt == NULL)
1327                 goto out;
1328
1329         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1330         if (on_link)
1331                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1332
1333         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1334         nrt->rt6i_dst.plen = 128;
1335         nrt->u.dst.flags |= DST_HOST;
1336
1337         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1338         nrt->rt6i_nexthop = neigh_clone(neigh);
1339         /* Reset pmtu, it may be better */
1340         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1341         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1342
1343         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1344                 goto out;
1345
1346         netevent.old = &rt->u.dst;
1347         netevent.new = &nrt->u.dst;
1348         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1349
1350         if (rt->rt6i_flags&RTF_CACHE) {
1351                 ip6_del_rt(rt, NULL, NULL, NULL);
1352                 return;
1353         }
1354
1355 out:
1356         dst_release(&rt->u.dst);
1357         return;
1358 }
1359
1360 /*
1361  *      Handle ICMP "packet too big" messages
1362  *      i.e. Path MTU discovery
1363  */
1364
1365 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1366                         struct net_device *dev, u32 pmtu)
1367 {
1368         struct rt6_info *rt, *nrt;
1369         int allfrag = 0;
1370
1371         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1372         if (rt == NULL)
1373                 return;
1374
1375         if (pmtu >= dst_mtu(&rt->u.dst))
1376                 goto out;
1377
1378         if (pmtu < IPV6_MIN_MTU) {
1379                 /*
1380                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1381                  * MTU (1280) and a fragment header should always be included
1382                  * after a node receiving Too Big message reporting PMTU is
1383                  * less than the IPv6 Minimum Link MTU.
1384                  */
1385                 pmtu = IPV6_MIN_MTU;
1386                 allfrag = 1;
1387         }
1388
1389         /* New mtu received -> path was valid.
1390            They are sent only in response to data packets,
1391            so that this nexthop apparently is reachable. --ANK
1392          */
1393         dst_confirm(&rt->u.dst);
1394
1395         /* Host route. If it is static, it would be better
1396            not to override it, but add new one, so that
1397            when cache entry will expire old pmtu
1398            would return automatically.
1399          */
1400         if (rt->rt6i_flags & RTF_CACHE) {
1401                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1402                 if (allfrag)
1403                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1404                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1405                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1406                 goto out;
1407         }
1408
1409         /* Network route.
1410            Two cases are possible:
1411            1. It is connected route. Action: COW
1412            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1413          */
1414         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1415                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1416         else
1417                 nrt = rt6_alloc_clone(rt, daddr);
1418
1419         if (nrt) {
1420                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1421                 if (allfrag)
1422                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1423
1424                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1425                  * happened within 5 mins, the recommended timer is 10 mins.
1426                  * Here this route expiration time is set to ip6_rt_mtu_expires
1427                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1428                  * and detecting PMTU increase will be automatically happened.
1429                  */
1430                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1431                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1432
1433                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1434         }
1435 out:
1436         dst_release(&rt->u.dst);
1437 }
1438
1439 /*
1440  *      Misc support functions
1441  */
1442
1443 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1444 {
1445         struct rt6_info *rt = ip6_dst_alloc();
1446
1447         if (rt) {
1448                 rt->u.dst.input = ort->u.dst.input;
1449                 rt->u.dst.output = ort->u.dst.output;
1450
1451                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1452                 rt->u.dst.dev = ort->u.dst.dev;
1453                 if (rt->u.dst.dev)
1454                         dev_hold(rt->u.dst.dev);
1455                 rt->rt6i_idev = ort->rt6i_idev;
1456                 if (rt->rt6i_idev)
1457                         in6_dev_hold(rt->rt6i_idev);
1458                 rt->u.dst.lastuse = jiffies;
1459                 rt->rt6i_expires = 0;
1460
1461                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1462                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1463                 rt->rt6i_metric = 0;
1464
1465                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1466 #ifdef CONFIG_IPV6_SUBTREES
1467                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1468 #endif
1469                 rt->rt6i_table = ort->rt6i_table;
1470         }
1471         return rt;
1472 }
1473
1474 #ifdef CONFIG_IPV6_ROUTE_INFO
1475 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1476                                            struct in6_addr *gwaddr, int ifindex)
1477 {
1478         struct fib6_node *fn;
1479         struct rt6_info *rt = NULL;
1480         struct fib6_table *table;
1481
1482         table = fib6_get_table(RT6_TABLE_INFO);
1483         if (table == NULL)
1484                 return NULL;
1485
1486         write_lock_bh(&table->tb6_lock);
1487         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1488         if (!fn)
1489                 goto out;
1490
1491         for (rt = fn->leaf; rt; rt = rt->u.next) {
1492                 if (rt->rt6i_dev->ifindex != ifindex)
1493                         continue;
1494                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1495                         continue;
1496                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1497                         continue;
1498                 dst_hold(&rt->u.dst);
1499                 break;
1500         }
1501 out:
1502         write_unlock_bh(&table->tb6_lock);
1503         return rt;
1504 }
1505
1506 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1507                                            struct in6_addr *gwaddr, int ifindex,
1508                                            unsigned pref)
1509 {
1510         struct in6_rtmsg rtmsg;
1511
1512         memset(&rtmsg, 0, sizeof(rtmsg));
1513         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1514         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1515         rtmsg.rtmsg_dst_len = prefixlen;
1516         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1517         rtmsg.rtmsg_metric = 1024;
1518         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1519         /* We should treat it as a default route if prefix length is 0. */
1520         if (!prefixlen)
1521                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1522         rtmsg.rtmsg_ifindex = ifindex;
1523
1524         ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_INFO);
1525
1526         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1527 }
1528 #endif
1529
1530 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1531 {       
1532         struct rt6_info *rt;
1533         struct fib6_table *table;
1534
1535         table = fib6_get_table(RT6_TABLE_DFLT);
1536         if (table == NULL)
1537                 return NULL;
1538
1539         write_lock_bh(&table->tb6_lock);
1540         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1541                 if (dev == rt->rt6i_dev &&
1542                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1543                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1544                         break;
1545         }
1546         if (rt)
1547                 dst_hold(&rt->u.dst);
1548         write_unlock_bh(&table->tb6_lock);
1549         return rt;
1550 }
1551
1552 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1553                                      struct net_device *dev,
1554                                      unsigned int pref)
1555 {
1556         struct in6_rtmsg rtmsg;
1557
1558         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1559         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1560         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1561         rtmsg.rtmsg_metric = 1024;
1562         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1563                             RTF_PREF(pref);
1564
1565         rtmsg.rtmsg_ifindex = dev->ifindex;
1566
1567         ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_DFLT);
1568         return rt6_get_dflt_router(gwaddr, dev);
1569 }
1570
1571 void rt6_purge_dflt_routers(void)
1572 {
1573         struct rt6_info *rt;
1574         struct fib6_table *table;
1575
1576         /* NOTE: Keep consistent with rt6_get_dflt_router */
1577         table = fib6_get_table(RT6_TABLE_DFLT);
1578         if (table == NULL)
1579                 return;
1580
1581 restart:
1582         read_lock_bh(&table->tb6_lock);
1583         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1584                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1585                         dst_hold(&rt->u.dst);
1586                         read_unlock_bh(&table->tb6_lock);
1587                         ip6_del_rt(rt, NULL, NULL, NULL);
1588                         goto restart;
1589                 }
1590         }
1591         read_unlock_bh(&table->tb6_lock);
1592 }
1593
1594 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1595 {
1596         struct in6_rtmsg rtmsg;
1597         int err;
1598
1599         switch(cmd) {
1600         case SIOCADDRT:         /* Add a route */
1601         case SIOCDELRT:         /* Delete a route */
1602                 if (!capable(CAP_NET_ADMIN))
1603                         return -EPERM;
1604                 err = copy_from_user(&rtmsg, arg,
1605                                      sizeof(struct in6_rtmsg));
1606                 if (err)
1607                         return -EFAULT;
1608                         
1609                 rtnl_lock();
1610                 switch (cmd) {
1611                 case SIOCADDRT:
1612                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL,
1613                                             RT6_TABLE_MAIN);
1614                         break;
1615                 case SIOCDELRT:
1616                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL,
1617                                             RT6_TABLE_MAIN);
1618                         break;
1619                 default:
1620                         err = -EINVAL;
1621                 }
1622                 rtnl_unlock();
1623
1624                 return err;
1625         };
1626
1627         return -EINVAL;
1628 }
1629
1630 /*
1631  *      Drop the packet on the floor
1632  */
1633
1634 static int ip6_pkt_discard(struct sk_buff *skb)
1635 {
1636         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1637         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1638                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1639
1640         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1641         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1642         kfree_skb(skb);
1643         return 0;
1644 }
1645
1646 static int ip6_pkt_discard_out(struct sk_buff *skb)
1647 {
1648         skb->dev = skb->dst->dev;
1649         return ip6_pkt_discard(skb);
1650 }
1651
1652 /*
1653  *      Allocate a dst for local (unicast / anycast) address.
1654  */
1655
1656 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1657                                     const struct in6_addr *addr,
1658                                     int anycast)
1659 {
1660         struct rt6_info *rt = ip6_dst_alloc();
1661
1662         if (rt == NULL)
1663                 return ERR_PTR(-ENOMEM);
1664
1665         dev_hold(&loopback_dev);
1666         in6_dev_hold(idev);
1667
1668         rt->u.dst.flags = DST_HOST;
1669         rt->u.dst.input = ip6_input;
1670         rt->u.dst.output = ip6_output;
1671         rt->rt6i_dev = &loopback_dev;
1672         rt->rt6i_idev = idev;
1673         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1674         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1675         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1676         rt->u.dst.obsolete = -1;
1677
1678         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1679         if (anycast)
1680                 rt->rt6i_flags |= RTF_ANYCAST;
1681         else
1682                 rt->rt6i_flags |= RTF_LOCAL;
1683         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1684         if (rt->rt6i_nexthop == NULL) {
1685                 dst_free((struct dst_entry *) rt);
1686                 return ERR_PTR(-ENOMEM);
1687         }
1688
1689         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1690         rt->rt6i_dst.plen = 128;
1691         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1692
1693         atomic_set(&rt->u.dst.__refcnt, 1);
1694
1695         return rt;
1696 }
1697
1698 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1699 {
1700         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1701             rt != &ip6_null_entry) {
1702                 RT6_TRACE("deleted by ifdown %p\n", rt);
1703                 return -1;
1704         }
1705         return 0;
1706 }
1707
1708 void rt6_ifdown(struct net_device *dev)
1709 {
1710         fib6_clean_all(fib6_ifdown, 0, dev);
1711 }
1712
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU of that device */
};
1718
1719 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1720 {
1721         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1722         struct inet6_dev *idev;
1723
1724         /* In IPv6 pmtu discovery is not optional,
1725            so that RTAX_MTU lock cannot disable it.
1726            We still use this lock to block changes
1727            caused by addrconf/ndisc.
1728         */
1729
1730         idev = __in6_dev_get(arg->dev);
1731         if (idev == NULL)
1732                 return 0;
1733
1734         /* For administrative MTU increase, there is no way to discover
1735            IPv6 PMTU increase, so PMTU increase should be updated here.
1736            Since RFC 1981 doesn't include administrative MTU increase
1737            update PMTU increase is a MUST. (i.e. jumbo frame)
1738          */
1739         /*
1740            If new MTU is less than route PMTU, this new MTU will be the
1741            lowest MTU in the path, update the route PMTU to reflect PMTU
1742            decreases; if new MTU is greater than route PMTU, and the
1743            old MTU is the lowest MTU in the path, update the route PMTU
1744            to reflect the increase. In this case if the other nodes' MTU
1745            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1746            PMTU discouvery.
1747          */
1748         if (rt->rt6i_dev == arg->dev &&
1749             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1750             (dst_mtu(&rt->u.dst) > arg->mtu ||
1751              (dst_mtu(&rt->u.dst) < arg->mtu &&
1752               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1753                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1754         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1755         return 0;
1756 }
1757
1758 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1759 {
1760         struct rt6_mtu_change_arg arg = {
1761                 .dev = dev,
1762                 .mtu = mtu,
1763         };
1764
1765         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1766 }
1767
1768 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1769                               struct in6_rtmsg *rtmsg)
1770 {
1771         memset(rtmsg, 0, sizeof(*rtmsg));
1772
1773         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1774         rtmsg->rtmsg_src_len = r->rtm_src_len;
1775         rtmsg->rtmsg_flags = RTF_UP;
1776         if (r->rtm_type == RTN_UNREACHABLE)
1777                 rtmsg->rtmsg_flags |= RTF_REJECT;
1778
1779         if (rta[RTA_GATEWAY-1]) {
1780                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1781                         return -EINVAL;
1782                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1783                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1784         }
1785         if (rta[RTA_DST-1]) {
1786                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1787                         return -EINVAL;
1788                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1789         }
1790         if (rta[RTA_SRC-1]) {
1791                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1792                         return -EINVAL;
1793                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1794         }
1795         if (rta[RTA_OIF-1]) {
1796                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1797                         return -EINVAL;
1798                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1799         }
1800         if (rta[RTA_PRIORITY-1]) {
1801                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1802                         return -EINVAL;
1803                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1804         }
1805         return 0;
1806 }
1807
1808 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1809 {
1810         struct rtmsg *r = NLMSG_DATA(nlh);
1811         struct in6_rtmsg rtmsg;
1812
1813         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1814                 return -EINVAL;
1815         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
1816 }
1817
1818 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1819 {
1820         struct rtmsg *r = NLMSG_DATA(nlh);
1821         struct in6_rtmsg rtmsg;
1822
1823         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1824                 return -EINVAL;
1825         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
1826 }
1827
/*
 * Per-call context of a route dump; the tree walker carries a pointer
 * to it in its ->args field and rt6_dump_route() reads it back.
 */
struct rt6_rtnl_dump_arg {
        struct sk_buff *skb;            /* buffer the route messages are written to */
        struct netlink_callback *cb;    /* dump callback; gives access to the request */
};
1833
1834 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1835                          struct in6_addr *dst, struct in6_addr *src,
1836                          int iif, int type, u32 pid, u32 seq,
1837                          int prefix, unsigned int flags)
1838 {
1839         struct rtmsg *rtm;
1840         struct nlmsghdr  *nlh;
1841         unsigned char    *b = skb->tail;
1842         struct rta_cacheinfo ci;
1843
1844         if (prefix) {   /* user wants prefix routes only */
1845                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1846                         /* success since this is not a prefix route */
1847                         return 1;
1848                 }
1849         }
1850
1851         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1852         rtm = NLMSG_DATA(nlh);
1853         rtm->rtm_family = AF_INET6;
1854         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1855         rtm->rtm_src_len = rt->rt6i_src.plen;
1856         rtm->rtm_tos = 0;
1857         if (rt->rt6i_table)
1858                 rtm->rtm_table = rt->rt6i_table->tb6_id;
1859         else
1860                 rtm->rtm_table = RT6_TABLE_UNSPEC;
1861         rtm->rtm_table = RT_TABLE_MAIN;
1862         if (rt->rt6i_flags&RTF_REJECT)
1863                 rtm->rtm_type = RTN_UNREACHABLE;
1864         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1865                 rtm->rtm_type = RTN_LOCAL;
1866         else
1867                 rtm->rtm_type = RTN_UNICAST;
1868         rtm->rtm_flags = 0;
1869         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1870         rtm->rtm_protocol = rt->rt6i_protocol;
1871         if (rt->rt6i_flags&RTF_DYNAMIC)
1872                 rtm->rtm_protocol = RTPROT_REDIRECT;
1873         else if (rt->rt6i_flags & RTF_ADDRCONF)
1874                 rtm->rtm_protocol = RTPROT_KERNEL;
1875         else if (rt->rt6i_flags&RTF_DEFAULT)
1876                 rtm->rtm_protocol = RTPROT_RA;
1877
1878         if (rt->rt6i_flags&RTF_CACHE)
1879                 rtm->rtm_flags |= RTM_F_CLONED;
1880
1881         if (dst) {
1882                 RTA_PUT(skb, RTA_DST, 16, dst);
1883                 rtm->rtm_dst_len = 128;
1884         } else if (rtm->rtm_dst_len)
1885                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1886 #ifdef CONFIG_IPV6_SUBTREES
1887         if (src) {
1888                 RTA_PUT(skb, RTA_SRC, 16, src);
1889                 rtm->rtm_src_len = 128;
1890         } else if (rtm->rtm_src_len)
1891                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1892 #endif
1893         if (iif)
1894                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1895         else if (dst) {
1896                 struct in6_addr saddr_buf;
1897                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1898                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1899         }
1900         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1901                 goto rtattr_failure;
1902         if (rt->u.dst.neighbour)
1903                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1904         if (rt->u.dst.dev)
1905                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1906         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1907         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1908         if (rt->rt6i_expires)
1909                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1910         else
1911                 ci.rta_expires = 0;
1912         ci.rta_used = rt->u.dst.__use;
1913         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1914         ci.rta_error = rt->u.dst.error;
1915         ci.rta_id = 0;
1916         ci.rta_ts = 0;
1917         ci.rta_tsage = 0;
1918         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1919         nlh->nlmsg_len = skb->tail - b;
1920         return skb->len;
1921
1922 nlmsg_failure:
1923 rtattr_failure:
1924         skb_trim(skb, b - skb->data);
1925         return -1;
1926 }
1927
1928 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1929 {
1930         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1931         int prefix;
1932
1933         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1934                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1935                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1936         } else
1937                 prefix = 0;
1938
1939         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1940                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1941                      prefix, NLM_F_MULTI);
1942 }
1943
1944 static int fib6_dump_node(struct fib6_walker_t *w)
1945 {
1946         int res;
1947         struct rt6_info *rt;
1948
1949         for (rt = w->leaf; rt; rt = rt->u.next) {
1950                 res = rt6_dump_route(rt, w->args);
1951                 if (res < 0) {
1952                         /* Frame is full, suspend walking */
1953                         w->leaf = rt;
1954                         return 1;
1955                 }
1956                 BUG_TRAP(res!=0);
1957         }
1958         w->leaf = NULL;
1959         return 0;
1960 }
1961
1962 static void fib6_dump_end(struct netlink_callback *cb)
1963 {
1964         struct fib6_walker_t *w = (void*)cb->args[0];
1965
1966         if (w) {
1967                 cb->args[0] = 0;
1968                 kfree(w);
1969         }
1970         cb->done = (void*)cb->args[1];
1971         cb->args[1] = 0;
1972 }
1973
1974 static int fib6_dump_done(struct netlink_callback *cb)
1975 {
1976         fib6_dump_end(cb);
1977         return cb->done ? cb->done(cb) : 0;
1978 }
1979
1980 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1981 {
1982         struct fib6_table *table;
1983         struct rt6_rtnl_dump_arg arg;
1984         struct fib6_walker_t *w;
1985         int i, res = 0;
1986
1987         arg.skb = skb;
1988         arg.cb = cb;
1989
1990         /*
1991          * cb->args[0] = pointer to walker structure
1992          * cb->args[1] = saved cb->done() pointer
1993          * cb->args[2] = current table being dumped
1994          */
1995
1996         w = (void*)cb->args[0];
1997         if (w == NULL) {
1998                 /* New dump:
1999                  * 
2000                  * 1. hook callback destructor.
2001                  */
2002                 cb->args[1] = (long)cb->done;
2003                 cb->done = fib6_dump_done;
2004
2005                 /*
2006                  * 2. allocate and initialize walker.
2007                  */
2008                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
2009                 if (w == NULL)
2010                         return -ENOMEM;
2011                 w->func = fib6_dump_node;
2012                 w->args = &arg;
2013                 cb->args[0] = (long)w;
2014                 cb->args[2] = FIB6_TABLE_MIN;
2015         } else {
2016                 w->args = &arg;
2017                 i = cb->args[2];
2018                 if (i > FIB6_TABLE_MAX)
2019                         goto end;
2020
2021                 table = fib6_get_table(i);
2022                 if (table != NULL) {
2023                         read_lock_bh(&table->tb6_lock);
2024                         w->root = &table->tb6_root;
2025                         res = fib6_walk_continue(w);
2026                         read_unlock_bh(&table->tb6_lock);
2027                         if (res != 0) {
2028                                 if (res < 0)
2029                                         fib6_walker_unlink(w);
2030                                 goto end;
2031                         }
2032                 }
2033
2034                 fib6_walker_unlink(w);
2035                 cb->args[2] = ++i;
2036         }
2037
2038         for (i = cb->args[2]; i <= FIB6_TABLE_MAX; i++) {
2039                 table = fib6_get_table(i);
2040                 if (table == NULL)
2041                         continue;
2042
2043                 read_lock_bh(&table->tb6_lock);
2044                 w->root = &table->tb6_root;
2045                 res = fib6_walk(w);
2046                 read_unlock_bh(&table->tb6_lock);
2047                 if (res)
2048                         break;
2049         }
2050 end:
2051         cb->args[2] = i;
2052
2053         res = res < 0 ? res : skb->len;
2054         /* res < 0 is an error. (really, impossible)
2055            res == 0 means that dump is complete, but skb still can contain data.
2056            res > 0 dump is not complete, but frame is full.
2057          */
2058         /* Destroy walker, if dump of this table is complete. */
2059         if (res <= 0)
2060                 fib6_dump_end(cb);
2061         return res;
2062 }
2063
2064 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2065 {
2066         struct rtattr **rta = arg;
2067         int iif = 0;
2068         int err = -ENOBUFS;
2069         struct sk_buff *skb;
2070         struct flowi fl;
2071         struct rt6_info *rt;
2072
2073         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2074         if (skb == NULL)
2075                 goto out;
2076
2077         /* Reserve room for dummy headers, this skb can pass
2078            through good chunk of routing engine.
2079          */
2080         skb->mac.raw = skb->data;
2081         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2082
2083         memset(&fl, 0, sizeof(fl));
2084         if (rta[RTA_SRC-1])
2085                 ipv6_addr_copy(&fl.fl6_src,
2086                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
2087         if (rta[RTA_DST-1])
2088                 ipv6_addr_copy(&fl.fl6_dst,
2089                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
2090
2091         if (rta[RTA_IIF-1])
2092                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2093
2094         if (iif) {
2095                 struct net_device *dev;
2096                 dev = __dev_get_by_index(iif);
2097                 if (!dev) {
2098                         err = -ENODEV;
2099                         goto out_free;
2100                 }
2101         }
2102
2103         fl.oif = 0;
2104         if (rta[RTA_OIF-1])
2105                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2106
2107         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
2108
2109         skb->dst = &rt->u.dst;
2110
2111         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2112         err = rt6_fill_node(skb, rt, 
2113                             &fl.fl6_dst, &fl.fl6_src,
2114                             iif,
2115                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2116                             nlh->nlmsg_seq, 0, 0);
2117         if (err < 0) {
2118                 err = -EMSGSIZE;
2119                 goto out_free;
2120         }
2121
2122         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2123         if (err > 0)
2124                 err = 0;
2125 out:
2126         return err;
2127 out_free:
2128         kfree_skb(skb);
2129         goto out;       
2130 }
2131
2132 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
2133                         struct netlink_skb_parms *req)
2134 {
2135         struct sk_buff *skb;
2136         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2137         u32 pid = current->pid;
2138         u32 seq = 0;
2139
2140         if (req)
2141                 pid = req->pid;
2142         if (nlh)
2143                 seq = nlh->nlmsg_seq;
2144         
2145         skb = alloc_skb(size, gfp_any());
2146         if (!skb) {
2147                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2148                 return;
2149         }
2150         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2151                 kfree_skb(skb);
2152                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2153                 return;
2154         }
2155         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2156         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2157 }
2158
2159 /*
2160  *      /proc
2161  */
2162
2163 #ifdef CONFIG_PROC_FS
2164
/*
 * Nominal size of one text record produced by rt6_info_route(); used to
 * turn a byte offset into a number of whole records to skip.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
        char *buffer;   /* output buffer */
        int offset;     /* requested start offset, in bytes */
        int length;     /* no new record is started once len reaches this */
        int skip;       /* records skipped so far */
        int len;        /* bytes written to buffer so far */
};
2175
2176 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2177 {
2178         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2179         int i;
2180
2181         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2182                 arg->skip++;
2183                 return 0;
2184         }
2185
2186         if (arg->len >= arg->length)
2187                 return 0;
2188
2189         for (i=0; i<16; i++) {
2190                 sprintf(arg->buffer + arg->len, "%02x",
2191                         rt->rt6i_dst.addr.s6_addr[i]);
2192                 arg->len += 2;
2193         }
2194         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2195                             rt->rt6i_dst.plen);
2196
2197 #ifdef CONFIG_IPV6_SUBTREES
2198         for (i=0; i<16; i++) {
2199                 sprintf(arg->buffer + arg->len, "%02x",
2200                         rt->rt6i_src.addr.s6_addr[i]);
2201                 arg->len += 2;
2202         }
2203         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2204                             rt->rt6i_src.plen);
2205 #else
2206         sprintf(arg->buffer + arg->len,
2207                 "00000000000000000000000000000000 00 ");
2208         arg->len += 36;
2209 #endif
2210
2211         if (rt->rt6i_nexthop) {
2212                 for (i=0; i<16; i++) {
2213                         sprintf(arg->buffer + arg->len, "%02x",
2214                                 rt->rt6i_nexthop->primary_key[i]);
2215                         arg->len += 2;
2216                 }
2217         } else {
2218                 sprintf(arg->buffer + arg->len,
2219                         "00000000000000000000000000000000");
2220                 arg->len += 32;
2221         }
2222         arg->len += sprintf(arg->buffer + arg->len,
2223                             " %08x %08x %08x %08x %8s\n",
2224                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2225                             rt->u.dst.__use, rt->rt6i_flags, 
2226                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2227         return 0;
2228 }
2229
2230 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2231 {
2232         struct rt6_proc_arg arg = {
2233                 .buffer = buffer,
2234                 .offset = offset,
2235                 .length = length,
2236         };
2237
2238         fib6_clean_all(rt6_info_route, 0, &arg);
2239
2240         *start = buffer;
2241         if (offset)
2242                 *start += offset % RT6_INFO_LEN;
2243
2244         arg.len -= offset % RT6_INFO_LEN;
2245
2246         if (arg.len > length)
2247                 arg.len = length;
2248         if (arg.len < 0)
2249                 arg.len = 0;
2250
2251         return arg.len;
2252 }
2253
2254 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2255 {
2256         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2257                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2258                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2259                       rt6_stats.fib_rt_cache,
2260                       atomic_read(&ip6_dst_ops.entries),
2261                       rt6_stats.fib_discarded_routes);
2262
2263         return 0;
2264 }
2265
2266 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2267 {
2268         return single_open(file, rt6_stats_seq_show, NULL);
2269 }
2270
2271 static struct file_operations rt6_stats_seq_fops = {
2272         .owner   = THIS_MODULE,
2273         .open    = rt6_stats_seq_open,
2274         .read    = seq_read,
2275         .llseek  = seq_lseek,
2276         .release = single_release,
2277 };
2278 #endif  /* CONFIG_PROC_FS */
2279
2280 #ifdef CONFIG_SYSCTL
2281
2282 static int flush_delay;
2283
2284 static
2285 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2286                               void __user *buffer, size_t *lenp, loff_t *ppos)
2287 {
2288         if (write) {
2289                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2290                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2291                 return 0;
2292         } else
2293                 return -EINVAL;
2294 }
2295
2296 ctl_table ipv6_route_table[] = {
2297         {
2298                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2299                 .procname       =       "flush",
2300                 .data           =       &flush_delay,
2301                 .maxlen         =       sizeof(int),
2302                 .mode           =       0200,
2303                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2304         },
2305         {
2306                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2307                 .procname       =       "gc_thresh",
2308                 .data           =       &ip6_dst_ops.gc_thresh,
2309                 .maxlen         =       sizeof(int),
2310                 .mode           =       0644,
2311                 .proc_handler   =       &proc_dointvec,
2312         },
2313         {
2314                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2315                 .procname       =       "max_size",
2316                 .data           =       &ip6_rt_max_size,
2317                 .maxlen         =       sizeof(int),
2318                 .mode           =       0644,
2319                 .proc_handler   =       &proc_dointvec,
2320         },
2321         {
2322                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2323                 .procname       =       "gc_min_interval",
2324                 .data           =       &ip6_rt_gc_min_interval,
2325                 .maxlen         =       sizeof(int),
2326                 .mode           =       0644,
2327                 .proc_handler   =       &proc_dointvec_jiffies,
2328                 .strategy       =       &sysctl_jiffies,
2329         },
2330         {
2331                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2332                 .procname       =       "gc_timeout",
2333                 .data           =       &ip6_rt_gc_timeout,
2334                 .maxlen         =       sizeof(int),
2335                 .mode           =       0644,
2336                 .proc_handler   =       &proc_dointvec_jiffies,
2337                 .strategy       =       &sysctl_jiffies,
2338         },
2339         {
2340                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2341                 .procname       =       "gc_interval",
2342                 .data           =       &ip6_rt_gc_interval,
2343                 .maxlen         =       sizeof(int),
2344                 .mode           =       0644,
2345                 .proc_handler   =       &proc_dointvec_jiffies,
2346                 .strategy       =       &sysctl_jiffies,
2347         },
2348         {
2349                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2350                 .procname       =       "gc_elasticity",
2351                 .data           =       &ip6_rt_gc_elasticity,
2352                 .maxlen         =       sizeof(int),
2353                 .mode           =       0644,
2354                 .proc_handler   =       &proc_dointvec_jiffies,
2355                 .strategy       =       &sysctl_jiffies,
2356         },
2357         {
2358                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2359                 .procname       =       "mtu_expires",
2360                 .data           =       &ip6_rt_mtu_expires,
2361                 .maxlen         =       sizeof(int),
2362                 .mode           =       0644,
2363                 .proc_handler   =       &proc_dointvec_jiffies,
2364                 .strategy       =       &sysctl_jiffies,
2365         },
2366         {
2367                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2368                 .procname       =       "min_adv_mss",
2369                 .data           =       &ip6_rt_min_advmss,
2370                 .maxlen         =       sizeof(int),
2371                 .mode           =       0644,
2372                 .proc_handler   =       &proc_dointvec_jiffies,
2373                 .strategy       =       &sysctl_jiffies,
2374         },
2375         {
2376                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2377                 .procname       =       "gc_min_interval_ms",
2378                 .data           =       &ip6_rt_gc_min_interval,
2379                 .maxlen         =       sizeof(int),
2380                 .mode           =       0644,
2381                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2382                 .strategy       =       &sysctl_ms_jiffies,
2383         },
2384         { .ctl_name = 0 }
2385 };
2386
2387 #endif
2388
2389 void __init ip6_route_init(void)
2390 {
2391         struct proc_dir_entry *p;
2392
2393         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2394                                                      sizeof(struct rt6_info),
2395                                                      0, SLAB_HWCACHE_ALIGN,
2396                                                      NULL, NULL);
2397         if (!ip6_dst_ops.kmem_cachep)
2398                 panic("cannot create ip6_dst_cache");
2399
2400         fib6_init();
2401 #ifdef  CONFIG_PROC_FS
2402         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2403         if (p)
2404                 p->owner = THIS_MODULE;
2405
2406         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2407 #endif
2408 #ifdef CONFIG_XFRM
2409         xfrm6_init();
2410 #endif
2411 }
2412
2413 void ip6_route_cleanup(void)
2414 {
2415 #ifdef CONFIG_PROC_FS
2416         proc_net_remove("ipv6_route");
2417         proc_net_remove("rt6_stats");
2418 #endif
2419 #ifdef CONFIG_XFRM
2420         xfrm6_fini();
2421 #endif
2422         rt6_ifdown(NULL);
2423         fib6_gc_cleanup();
2424         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2425 }