net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD;
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(struct dst_ops *ops);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries;
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
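/*
 * A minimal sketch of how that scheme is used below (illustrative only,
 * relying on rt_hash_table, rt_hash_lock_addr() and rt_free(), all of
 * which are defined later in this file):
 *
 *	reader (lockless lookup):
 *		rcu_read_lock_bh();
 *		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *		     rth = rcu_dereference(rth->u.dst.rt_next))
 *			;	(may take a reference via dst_hold())
 *		rcu_read_unlock_bh();
 *
 *	writer (unlink under the per-bucket lock):
 *		spin_lock_bh(rt_hash_lock_addr(hash));
 *		*rthp = rth->u.dst.rt_next;
 *		rt_free(rth);	(frees via call_rcu_bh())
 *		spin_unlock_bh(rt_hash_lock_addr(hash));
 */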
210 struct rt_hash_bucket {
211         struct rtable   *chain;
212 };
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214         defined(CONFIG_PROVE_LOCKING)
215 /*
216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217  * The size of this table is a power of two and depends on the number of CPUs.
218  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
219  */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ        256
222 #else
223 # if NR_CPUS >= 32
224 #  define RT_HASH_LOCK_SZ       4096
225 # elif NR_CPUS >= 16
226 #  define RT_HASH_LOCK_SZ       2048
227 # elif NR_CPUS >= 8
228 #  define RT_HASH_LOCK_SZ       1024
229 # elif NR_CPUS >= 4
230 #  define RT_HASH_LOCK_SZ       512
231 # else
232 #  define RT_HASH_LOCK_SZ       256
233 # endif
234 #endif
235
236 static spinlock_t       *rt_hash_locks;
237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238
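/*
 * For example (illustrative), with NR_CPUS = 8 and no lockdep,
 * RT_HASH_LOCK_SZ is 1024, so hash bucket 5000 shares the spinlock
 * rt_hash_locks[5000 & 1023] = rt_hash_locks[904] with every 1024th
 * bucket, keeping lock memory bounded independently of the table size.
 */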
239 static __init void rt_hash_lock_init(void)
240 {
241         int i;
242
243         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
244                         GFP_KERNEL);
245         if (!rt_hash_locks)
246                 panic("IP: failed to allocate rt_hash_locks\n");
247
248         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
249                 spin_lock_init(&rt_hash_locks[i]);
250 }
251 #else
252 # define rt_hash_lock_addr(slot) NULL
253
254 static inline void rt_hash_lock_init(void)
255 {
256 }
257 #endif
258
259 static struct rt_hash_bucket    *rt_hash_table;
260 static unsigned                 rt_hash_mask;
261 static unsigned int             rt_hash_log;
262 static unsigned int             rt_hash_rnd;
263
264 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
265 #define RT_CACHE_STAT_INC(field) \
266         (__raw_get_cpu_var(rt_cache_stat).field++)
267
268 static int rt_intern_hash(unsigned hash, struct rtable *rth,
269                                 struct rtable **res);
270
271 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
272 {
273         return (jhash_2words(daddr, saddr, rt_hash_rnd)
274                 & rt_hash_mask);
275 }
276
277 #define rt_hash(daddr, saddr, idx) \
278         rt_hash_code((__force u32)(__be32)(daddr),\
279                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
280
281 #ifdef CONFIG_PROC_FS
282 struct rt_cache_iter_state {
283         int bucket;
284 };
285
286 static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 {
288         struct rtable *r = NULL;
289         struct rt_cache_iter_state *st = seq->private;
290
291         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
292                 rcu_read_lock_bh();
293                 r = rt_hash_table[st->bucket].chain;
294                 if (r)
295                         break;
296                 rcu_read_unlock_bh();
297         }
298         return rcu_dereference(r);
299 }
300
301 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 if (--st->bucket < 0)
309                         break;
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference(r);
314 }
315
316 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
317 {
318         struct rtable *r = rt_cache_get_first(seq);
319
320         if (r)
321                 while (pos && (r = rt_cache_get_next(seq, r)))
322                         --pos;
323         return pos ? NULL : r;
324 }
325
326 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
327 {
328         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
329 }
330
331 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
332 {
333         struct rtable *r = NULL;
334
335         if (v == SEQ_START_TOKEN)
336                 r = rt_cache_get_first(seq);
337         else
338                 r = rt_cache_get_next(seq, v);
339         ++*pos;
340         return r;
341 }
342
343 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
344 {
345         if (v && v != SEQ_START_TOKEN)
346                 rcu_read_unlock_bh();
347 }
348
349 static int rt_cache_seq_show(struct seq_file *seq, void *v)
350 {
351         if (v == SEQ_START_TOKEN)
352                 seq_printf(seq, "%-127s\n",
353                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
354                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
355                            "HHUptod\tSpecDst");
356         else {
357                 struct rtable *r = v;
358                 char temp[256];
359
360                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
361                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
362                         r->u.dst.dev ? r->u.dst.dev->name : "*",
363                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
364                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
365                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
366                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
367                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
368                         dst_metric(&r->u.dst, RTAX_WINDOW),
369                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
370                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
371                         r->fl.fl4_tos,
372                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
373                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
374                                        dev_queue_xmit) : 0,
375                         r->rt_spec_dst);
376                 seq_printf(seq, "%-127s\n", temp);
377         }
378         return 0;
379 }
380
381 static const struct seq_operations rt_cache_seq_ops = {
382         .start  = rt_cache_seq_start,
383         .next   = rt_cache_seq_next,
384         .stop   = rt_cache_seq_stop,
385         .show   = rt_cache_seq_show,
386 };
387
388 static int rt_cache_seq_open(struct inode *inode, struct file *file)
389 {
390         return seq_open_private(file, &rt_cache_seq_ops,
391                         sizeof(struct rt_cache_iter_state));
392 }
393
394 static const struct file_operations rt_cache_seq_fops = {
395         .owner   = THIS_MODULE,
396         .open    = rt_cache_seq_open,
397         .read    = seq_read,
398         .llseek  = seq_lseek,
399         .release = seq_release_private,
400 };
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
433 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
434 {
435
436 }
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
472 static const struct seq_operations rt_cpu_seq_ops = {
473         .start  = rt_cpu_seq_start,
474         .next   = rt_cpu_seq_next,
475         .stop   = rt_cpu_seq_stop,
476         .show   = rt_cpu_seq_show,
477 };
478
479
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482         return seq_open(file, &rt_cpu_seq_ops);
483 }
484
485 static const struct file_operations rt_cpu_seq_fops = {
486         .owner   = THIS_MODULE,
487         .open    = rt_cpu_seq_open,
488         .read    = seq_read,
489         .llseek  = seq_lseek,
490         .release = seq_release,
491 };
492
493 #ifdef CONFIG_NET_CLS_ROUTE
494 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
495                            int length, int *eof, void *data)
496 {
497         unsigned int i;
498
499         if ((offset & 3) || (length & 3))
500                 return -EIO;
501
502         if (offset >= sizeof(struct ip_rt_acct) * 256) {
503                 *eof = 1;
504                 return 0;
505         }
506
507         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
508                 length = sizeof(struct ip_rt_acct) * 256 - offset;
509                 *eof = 1;
510         }
511
512         offset /= sizeof(u32);
513
514         if (length > 0) {
515                 u32 *dst = (u32 *) buffer;
516
517                 *start = buffer;
518                 memset(dst, 0, length);
519
520                 for_each_possible_cpu(i) {
521                         unsigned int j;
522                         u32 *src;
523
524                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
525                         for (j = 0; j < length/4; j++)
526                                 dst[j] += src[j];
527                 }
528         }
529         return length;
530 }
531 #endif
532
533 static __init int ip_rt_proc_init(struct net *net)
534 {
535         struct proc_dir_entry *pde;
536
537         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
538                         &rt_cache_seq_fops);
539         if (!pde)
540                 goto err1;
541
542         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
543         if (!pde)
544                 goto err2;
545
546         pde->proc_fops = &rt_cpu_seq_fops;
547
548 #ifdef CONFIG_NET_CLS_ROUTE
549         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
550                         ip_rt_acct_read, NULL);
551         if (!pde)
552                 goto err3;
553 #endif
554         return 0;
555
556 #ifdef CONFIG_NET_CLS_ROUTE
557 err3:
558         remove_proc_entry("rt_cache", net->proc_net_stat);
559 #endif
560 err2:
561         remove_proc_entry("rt_cache", net->proc_net);
562 err1:
563         return -ENOMEM;
564 }
565 #else
566 static inline int ip_rt_proc_init(struct net *net)
567 {
568         return 0;
569 }
570 #endif /* CONFIG_PROC_FS */
571
572 static __inline__ void rt_free(struct rtable *rt)
573 {
574         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
575 }
576
577 static __inline__ void rt_drop(struct rtable *rt)
578 {
579         ip_rt_put(rt);
580         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
581 }
582
583 static __inline__ int rt_fast_clean(struct rtable *rth)
584 {
585         /* Kill broadcast/multicast entries very aggressively if they
586            collide in the hash table with more useful entries */
587         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
588                 rth->fl.iif && rth->u.dst.rt_next;
589 }
590
591 static __inline__ int rt_valuable(struct rtable *rth)
592 {
593         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
594                 rth->u.dst.expires;
595 }
596
597 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
598 {
599         unsigned long age;
600         int ret = 0;
601
602         if (atomic_read(&rth->u.dst.__refcnt))
603                 goto out;
604
605         ret = 1;
606         if (rth->u.dst.expires &&
607             time_after_eq(jiffies, rth->u.dst.expires))
608                 goto out;
609
610         age = jiffies - rth->u.dst.lastuse;
611         ret = 0;
612         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
613             (age <= tmo2 && rt_valuable(rth)))
614                 goto out;
615         ret = 1;
616 out:    return ret;
617 }
618
619 /* Bits of score are:
620  * 31: very valuable
621  * 30: not quite useless
622  * 29..0: usage counter
623  */
624 static inline u32 rt_score(struct rtable *rt)
625 {
626         u32 score = jiffies - rt->u.dst.lastuse;
627
628         score = ~score & ~(3<<30);
629
630         if (rt_valuable(rt))
631                 score |= (1<<31);
632
633         if (!rt->fl.iif ||
634             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
635                 score |= (1<<30);
636
637         return score;
638 }
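/*
 * Worked example (illustrative, assuming HZ = 1000): an unreferenced entry
 * last used 2 seconds ago has age = 2000, so score = ~2000 & ~(3<<30),
 * i.e. 0x3FFFF82F.  A valuable entry (redirected, notify, or with an expiry)
 * additionally sets bit 31, and an output/unicast entry sets bit 30, so
 * rt_intern_hash() below, which evicts the unreferenced entry with the
 * lowest score once a chain grows past ip_rt_gc_elasticity, preferentially
 * drops old input broadcast/multicast routes.
 */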
639
640 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
641 {
642         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
643                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
644                 (fl1->mark ^ fl2->mark) |
645                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
646                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
647                 (fl1->oif ^ fl2->oif) |
648                 (fl1->iif ^ fl2->iif)) == 0;
649 }
650
651 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
652 {
653         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
654 }
655
656 /*
657  * Perform a full scan of the hash table and free all entries.
658  * Can be called from a softirq or a process.
659  * In the latter case, we reschedule if necessary.
660  */
661 static void rt_do_flush(int process_context)
662 {
663         unsigned int i;
664         struct rtable *rth, *next;
665
666         for (i = 0; i <= rt_hash_mask; i++) {
667                 if (process_context && need_resched())
668                         cond_resched();
669                 rth = rt_hash_table[i].chain;
670                 if (!rth)
671                         continue;
672
673                 spin_lock_bh(rt_hash_lock_addr(i));
674                 rth = rt_hash_table[i].chain;
675                 rt_hash_table[i].chain = NULL;
676                 spin_unlock_bh(rt_hash_lock_addr(i));
677
678                 for (; rth; rth = next) {
679                         next = rth->u.dst.rt_next;
680                         rt_free(rth);
681                 }
682         }
683 }
684
685 static void rt_check_expire(void)
686 {
687         static unsigned int rover;
688         unsigned int i = rover, goal;
689         struct rtable *rth, **rthp;
690         u64 mult;
691
692         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
693         if (ip_rt_gc_timeout > 1)
694                 do_div(mult, ip_rt_gc_timeout);
695         goal = (unsigned int)mult;
696         if (goal > rt_hash_mask)
697                 goal = rt_hash_mask + 1;
698         for (; goal > 0; goal--) {
699                 unsigned long tmo = ip_rt_gc_timeout;
700
701                 i = (i + 1) & rt_hash_mask;
702                 rthp = &rt_hash_table[i].chain;
703
704                 if (need_resched())
705                         cond_resched();
706
707                 if (*rthp == NULL)
708                         continue;
709                 spin_lock_bh(rt_hash_lock_addr(i));
710                 while ((rth = *rthp) != NULL) {
711                         if (rth->u.dst.expires) {
712                                 /* Entry is expired even if it is in use */
713                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
714                                         tmo >>= 1;
715                                         rthp = &rth->u.dst.rt_next;
716                                         continue;
717                                 }
718                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
719                                 tmo >>= 1;
720                                 rthp = &rth->u.dst.rt_next;
721                                 continue;
722                         }
723
724                         /* Cleanup aged off entries. */
725                         *rthp = rth->u.dst.rt_next;
726                         rt_free(rth);
727                 }
728                 spin_unlock_bh(rt_hash_lock_addr(i));
729         }
730         rover = i;
731 }
732
733 /*
734  * rt_worker_func() is run in process context.
735  * If a whole flush was scheduled, it is done.
736  * Otherwise, we call rt_check_expire() to scan part of the hash table.
737  */
738 static void rt_worker_func(struct work_struct *work)
739 {
740         if (ip_rt_flush_expected) {
741                 ip_rt_flush_expected = 0;
742                 rt_do_flush(1);
743         } else
744                 rt_check_expire();
745         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
746 }
747
748 /* This can run from both BH and non-BH contexts, the latter
749  * in the case of a forced flush event.
750  */
751 static void rt_run_flush(unsigned long process_context)
752 {
753         rt_deadline = 0;
754
755         get_random_bytes(&rt_hash_rnd, 4);
756
757         rt_do_flush(process_context);
758 }
759
760 static DEFINE_SPINLOCK(rt_flush_lock);
761
762 void rt_cache_flush(int delay)
763 {
764         unsigned long now = jiffies;
765         int user_mode = !in_softirq();
766
767         if (delay < 0)
768                 delay = ip_rt_min_delay;
769
770         spin_lock_bh(&rt_flush_lock);
771
772         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
773                 long tmo = (long)(rt_deadline - now);
774
775                 /* If flush timer is already running
776                    and flush request is not immediate (delay > 0):
777
778                    if the deadline is not yet reached, prolong the timer to "delay",
779                    otherwise fire it at deadline time.
780                  */
781
782                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
783                         tmo = 0;
784
785                 if (delay > tmo)
786                         delay = tmo;
787         }
788
789         if (delay <= 0) {
790                 spin_unlock_bh(&rt_flush_lock);
791                 rt_run_flush(user_mode);
792                 return;
793         }
794
795         if (rt_deadline == 0)
796                 rt_deadline = now + ip_rt_max_delay;
797
798         mod_timer(&rt_flush_timer, now+delay);
799         spin_unlock_bh(&rt_flush_lock);
800 }
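/*
 * Usage sketch (illustrative): rt_cache_flush(0) flushes immediately;
 * rt_cache_flush(-1) requests a flush after ip_rt_min_delay (2 * HZ,
 * i.e. two seconds), and a pending flush is never deferred beyond
 * rt_deadline, which is set at most ip_rt_max_delay (10 * HZ) into
 * the future.
 */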
801
802 /*
803  * We change rt_hash_rnd and ask the next rt_worker_func() invocation
804  * to perform a flush in process context.
805  */
806 static void rt_secret_rebuild(unsigned long dummy)
807 {
808         get_random_bytes(&rt_hash_rnd, 4);
809         ip_rt_flush_expected = 1;
810         cancel_delayed_work(&expires_work);
811         schedule_delayed_work(&expires_work, HZ/10);
812         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
813 }
814
815 /*
816    Short description of GC goals.
817
818    We want an algorithm that keeps the routing cache at some
819    equilibrium point, where the number of aged-off entries stays
820    approximately equal to the number of newly generated ones.
821
822    The current expiration strength is the variable "expire".
823    We try to adjust it dynamically, so that when the network is
824    idle "expire" is large enough to keep enough warm entries,
825    and when load increases it shrinks to limit the cache size.
826  */
827
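/*
 * Worked example (illustrative, using the defaults above): with
 * ip_rt_gc_elasticity = 8 and a hash table of 2^rt_hash_log buckets,
 * say rt_hash_log = 16, the initial goal is
 * entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. the excess over
 * an average of 8 entries per bucket, so shrinking only starts once the
 * cache exceeds 8 << 16 = 524288 entries.
 */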
828 static int rt_garbage_collect(struct dst_ops *ops)
829 {
830         static unsigned long expire = RT_GC_TIMEOUT;
831         static unsigned long last_gc;
832         static int rover;
833         static int equilibrium;
834         struct rtable *rth, **rthp;
835         unsigned long now = jiffies;
836         int goal;
837
838         /*
839          * Garbage collection is pretty expensive,
840          * so do not run it too frequently.
841          */
842
843         RT_CACHE_STAT_INC(gc_total);
844
845         if (now - last_gc < ip_rt_gc_min_interval &&
846             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
847                 RT_CACHE_STAT_INC(gc_ignored);
848                 goto out;
849         }
850
851         /* Calculate the number of entries we want to expire now. */
852         goal = atomic_read(&ipv4_dst_ops.entries) -
853                 (ip_rt_gc_elasticity << rt_hash_log);
854         if (goal <= 0) {
855                 if (equilibrium < ipv4_dst_ops.gc_thresh)
856                         equilibrium = ipv4_dst_ops.gc_thresh;
857                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
858                 if (goal > 0) {
859                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
860                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
861                 }
862         } else {
863                 /* We are in a dangerous area. Try to reduce the cache really
864                  * aggressively.
865                  */
866                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
867                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
868         }
869
870         if (now - last_gc >= ip_rt_gc_min_interval)
871                 last_gc = now;
872
873         if (goal <= 0) {
874                 equilibrium += goal;
875                 goto work_done;
876         }
877
878         do {
879                 int i, k;
880
881                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
882                         unsigned long tmo = expire;
883
884                         k = (k + 1) & rt_hash_mask;
885                         rthp = &rt_hash_table[k].chain;
886                         spin_lock_bh(rt_hash_lock_addr(k));
887                         while ((rth = *rthp) != NULL) {
888                                 if (!rt_may_expire(rth, tmo, expire)) {
889                                         tmo >>= 1;
890                                         rthp = &rth->u.dst.rt_next;
891                                         continue;
892                                 }
893                                 *rthp = rth->u.dst.rt_next;
894                                 rt_free(rth);
895                                 goal--;
896                         }
897                         spin_unlock_bh(rt_hash_lock_addr(k));
898                         if (goal <= 0)
899                                 break;
900                 }
901                 rover = k;
902
903                 if (goal <= 0)
904                         goto work_done;
905
906                 /* The goal was not achieved. We stop the process if:
907
908                    - expire was reduced to zero; otherwise expire is halved.
909                    - the table is not full.
910                    - we are called from interrupt (softirq) context.
911                    - the jiffies check is just a fallback/debug loop breaker;
912                      we will not spin here for a long time in any case.
913                  */
914
915                 RT_CACHE_STAT_INC(gc_goal_miss);
916
917                 if (expire == 0)
918                         break;
919
920                 expire >>= 1;
921 #if RT_CACHE_DEBUG >= 2
922                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
923                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
924 #endif
925
926                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
927                         goto out;
928         } while (!in_softirq() && time_before_eq(jiffies, now));
929
930         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
931                 goto out;
932         if (net_ratelimit())
933                 printk(KERN_WARNING "dst cache overflow\n");
934         RT_CACHE_STAT_INC(gc_dst_overflow);
935         return 1;
936
937 work_done:
938         expire += ip_rt_gc_min_interval;
939         if (expire > ip_rt_gc_timeout ||
940             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
941                 expire = ip_rt_gc_timeout;
942 #if RT_CACHE_DEBUG >= 2
943         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
944                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
945 #endif
946 out:    return 0;
947 }
948
949 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
950 {
951         struct rtable   *rth, **rthp;
952         unsigned long   now;
953         struct rtable *cand, **candp;
954         u32             min_score;
955         int             chain_length;
956         int attempts = !in_softirq();
957
958 restart:
959         chain_length = 0;
960         min_score = ~(u32)0;
961         cand = NULL;
962         candp = NULL;
963         now = jiffies;
964
965         rthp = &rt_hash_table[hash].chain;
966
967         spin_lock_bh(rt_hash_lock_addr(hash));
968         while ((rth = *rthp) != NULL) {
969                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
970                         /* Put it first */
971                         *rthp = rth->u.dst.rt_next;
972                         /*
973                          * Since lookup is lockfree, the deletion
974                          * must be visible to another weakly ordered CPU before
975                          * the insertion at the start of the hash chain.
976                          */
977                         rcu_assign_pointer(rth->u.dst.rt_next,
978                                            rt_hash_table[hash].chain);
979                         /*
980                          * Since lookup is lockfree, the update writes
981                          * must be ordered for consistency on SMP.
982                          */
983                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
984
985                         dst_use(&rth->u.dst, now);
986                         spin_unlock_bh(rt_hash_lock_addr(hash));
987
988                         rt_drop(rt);
989                         *rp = rth;
990                         return 0;
991                 }
992
993                 if (!atomic_read(&rth->u.dst.__refcnt)) {
994                         u32 score = rt_score(rth);
995
996                         if (score <= min_score) {
997                                 cand = rth;
998                                 candp = rthp;
999                                 min_score = score;
1000                         }
1001                 }
1002
1003                 chain_length++;
1004
1005                 rthp = &rth->u.dst.rt_next;
1006         }
1007
1008         if (cand) {
1009                 /* ip_rt_gc_elasticity used to be the average chain length;
1010                  * when it is exceeded, gc becomes really aggressive.
1011                  *
1012                  * The second limit is less certain. At the moment it allows
1013                  * only 2 entries per bucket. We will see.
1014                  */
1015                 if (chain_length > ip_rt_gc_elasticity) {
1016                         *candp = cand->u.dst.rt_next;
1017                         rt_free(cand);
1018                 }
1019         }
1020
1021         /* Try to bind the route to an ARP entry only if it is an output
1022            route or on the unicast forwarding path.
1023          */
1024         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1025                 int err = arp_bind_neighbour(&rt->u.dst);
1026                 if (err) {
1027                         spin_unlock_bh(rt_hash_lock_addr(hash));
1028
1029                         if (err != -ENOBUFS) {
1030                                 rt_drop(rt);
1031                                 return err;
1032                         }
1033
1034                         /* Neighbour tables are full and nothing
1035                            can be released. Try to shrink the route cache;
1036                            it most likely holds some neighbour records.
1037                          */
1038                         if (attempts-- > 0) {
1039                                 int saved_elasticity = ip_rt_gc_elasticity;
1040                                 int saved_int = ip_rt_gc_min_interval;
1041                                 ip_rt_gc_elasticity     = 1;
1042                                 ip_rt_gc_min_interval   = 0;
1043                                 rt_garbage_collect(&ipv4_dst_ops);
1044                                 ip_rt_gc_min_interval   = saved_int;
1045                                 ip_rt_gc_elasticity     = saved_elasticity;
1046                                 goto restart;
1047                         }
1048
1049                         if (net_ratelimit())
1050                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1051                         rt_drop(rt);
1052                         return -ENOBUFS;
1053                 }
1054         }
1055
1056         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1057 #if RT_CACHE_DEBUG >= 2
1058         if (rt->u.dst.rt_next) {
1059                 struct rtable *trt;
1060                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1061                        NIPQUAD(rt->rt_dst));
1062                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1063                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1064                 printk("\n");
1065         }
1066 #endif
1067         rt_hash_table[hash].chain = rt;
1068         spin_unlock_bh(rt_hash_lock_addr(hash));
1069         *rp = rt;
1070         return 0;
1071 }
1072
1073 void rt_bind_peer(struct rtable *rt, int create)
1074 {
1075         static DEFINE_SPINLOCK(rt_peer_lock);
1076         struct inet_peer *peer;
1077
1078         peer = inet_getpeer(rt->rt_dst, create);
1079
1080         spin_lock_bh(&rt_peer_lock);
1081         if (rt->peer == NULL) {
1082                 rt->peer = peer;
1083                 peer = NULL;
1084         }
1085         spin_unlock_bh(&rt_peer_lock);
1086         if (peer)
1087                 inet_putpeer(peer);
1088 }
1089
1090 /*
1091  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1092  * we can still generate some output.
1093  * Random ID selection looks a bit dangerous because we have no chance of
1094  * selecting an ID that is unique within a reasonable period of time.
1095  * But a broken packet identifier may be better than no packet at all.
1096  */
1097 static void ip_select_fb_ident(struct iphdr *iph)
1098 {
1099         static DEFINE_SPINLOCK(ip_fb_id_lock);
1100         static u32 ip_fallback_id;
1101         u32 salt;
1102
1103         spin_lock_bh(&ip_fb_id_lock);
1104         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1105         iph->id = htons(salt & 0xFFFF);
1106         ip_fallback_id = salt;
1107         spin_unlock_bh(&ip_fb_id_lock);
1108 }
1109
1110 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1111 {
1112         struct rtable *rt = (struct rtable *) dst;
1113
1114         if (rt) {
1115                 if (rt->peer == NULL)
1116                         rt_bind_peer(rt, 1);
1117
1118                 /* Once a peer is attached to the destination, it is never detached,
1119                    so we do not need to grab a lock to dereference it.
1120                  */
1121                 if (rt->peer) {
1122                         iph->id = htons(inet_getid(rt->peer, more));
1123                         return;
1124                 }
1125         } else
1126                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1127                        __builtin_return_address(0));
1128
1129         ip_select_fb_ident(iph);
1130 }
1131
1132 static void rt_del(unsigned hash, struct rtable *rt)
1133 {
1134         struct rtable **rthp;
1135
1136         spin_lock_bh(rt_hash_lock_addr(hash));
1137         ip_rt_put(rt);
1138         for (rthp = &rt_hash_table[hash].chain; *rthp;
1139              rthp = &(*rthp)->u.dst.rt_next)
1140                 if (*rthp == rt) {
1141                         *rthp = rt->u.dst.rt_next;
1142                         rt_free(rt);
1143                         break;
1144                 }
1145         spin_unlock_bh(rt_hash_lock_addr(hash));
1146 }
1147
1148 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1149                     __be32 saddr, struct net_device *dev)
1150 {
1151         int i, k;
1152         struct in_device *in_dev = in_dev_get(dev);
1153         struct rtable *rth, **rthp;
1154         __be32  skeys[2] = { saddr, 0 };
1155         int  ikeys[2] = { dev->ifindex, 0 };
1156         struct netevent_redirect netevent;
1157
1158         if (!in_dev)
1159                 return;
1160
1161         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1162             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1163             || ipv4_is_zeronet(new_gw))
1164                 goto reject_redirect;
1165
1166         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1167                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1168                         goto reject_redirect;
1169                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1170                         goto reject_redirect;
1171         } else {
1172                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1173                         goto reject_redirect;
1174         }
1175
1176         for (i = 0; i < 2; i++) {
1177                 for (k = 0; k < 2; k++) {
1178                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1179
1180                         rthp=&rt_hash_table[hash].chain;
1181
1182                         rcu_read_lock();
1183                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1184                                 struct rtable *rt;
1185
1186                                 if (rth->fl.fl4_dst != daddr ||
1187                                     rth->fl.fl4_src != skeys[i] ||
1188                                     rth->fl.oif != ikeys[k] ||
1189                                     rth->fl.iif != 0) {
1190                                         rthp = &rth->u.dst.rt_next;
1191                                         continue;
1192                                 }
1193
1194                                 if (rth->rt_dst != daddr ||
1195                                     rth->rt_src != saddr ||
1196                                     rth->u.dst.error ||
1197                                     rth->rt_gateway != old_gw ||
1198                                     rth->u.dst.dev != dev)
1199                                         break;
1200
1201                                 dst_hold(&rth->u.dst);
1202                                 rcu_read_unlock();
1203
1204                                 rt = dst_alloc(&ipv4_dst_ops);
1205                                 if (rt == NULL) {
1206                                         ip_rt_put(rth);
1207                                         in_dev_put(in_dev);
1208                                         return;
1209                                 }
1210
1211                                 /* Copy all the information. */
1212                                 *rt = *rth;
1213                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1214                                 rt->u.dst.__use         = 1;
1215                                 atomic_set(&rt->u.dst.__refcnt, 1);
1216                                 rt->u.dst.child         = NULL;
1217                                 if (rt->u.dst.dev)
1218                                         dev_hold(rt->u.dst.dev);
1219                                 if (rt->idev)
1220                                         in_dev_hold(rt->idev);
1221                                 rt->u.dst.obsolete      = 0;
1222                                 rt->u.dst.lastuse       = jiffies;
1223                                 rt->u.dst.path          = &rt->u.dst;
1224                                 rt->u.dst.neighbour     = NULL;
1225                                 rt->u.dst.hh            = NULL;
1226                                 rt->u.dst.xfrm          = NULL;
1227
1228                                 rt->rt_flags            |= RTCF_REDIRECTED;
1229
1230                                 /* Gateway is different ... */
1231                                 rt->rt_gateway          = new_gw;
1232
1233                                 /* Redirect received -> path was valid */
1234                                 dst_confirm(&rth->u.dst);
1235
1236                                 if (rt->peer)
1237                                         atomic_inc(&rt->peer->refcnt);
1238
1239                                 if (arp_bind_neighbour(&rt->u.dst) ||
1240                                     !(rt->u.dst.neighbour->nud_state &
1241                                             NUD_VALID)) {
1242                                         if (rt->u.dst.neighbour)
1243                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1244                                         ip_rt_put(rth);
1245                                         rt_drop(rt);
1246                                         goto do_next;
1247                                 }
1248
1249                                 netevent.old = &rth->u.dst;
1250                                 netevent.new = &rt->u.dst;
1251                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1252                                                         &netevent);
1253
1254                                 rt_del(hash, rth);
1255                                 if (!rt_intern_hash(hash, rt, &rt))
1256                                         ip_rt_put(rt);
1257                                 goto do_next;
1258                         }
1259                         rcu_read_unlock();
1260                 do_next:
1261                         ;
1262                 }
1263         }
1264         in_dev_put(in_dev);
1265         return;
1266
1267 reject_redirect:
1268 #ifdef CONFIG_IP_ROUTE_VERBOSE
1269         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1270                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1271                         "%u.%u.%u.%u ignored.\n"
1272                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1273                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1274                        NIPQUAD(saddr), NIPQUAD(daddr));
1275 #endif
1276         in_dev_put(in_dev);
1277 }
1278
1279 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1280 {
1281         struct rtable *rt = (struct rtable*)dst;
1282         struct dst_entry *ret = dst;
1283
1284         if (rt) {
1285                 if (dst->obsolete) {
1286                         ip_rt_put(rt);
1287                         ret = NULL;
1288                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1289                            rt->u.dst.expires) {
1290                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1291                                                 rt->fl.oif);
1292 #if RT_CACHE_DEBUG >= 1
1293                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1294                                           "%u.%u.%u.%u/%02x dropped\n",
1295                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1296 #endif
1297                         rt_del(hash, rt);
1298                         ret = NULL;
1299                 }
1300         }
1301         return ret;
1302 }
1303
1304 /*
1305  * Algorithm:
1306  *      1. The first ip_rt_redirect_number redirects are sent
1307  *         with exponential backoff; then we stop sending them altogether,
1308  *         assuming that the host ignores our redirects.
1309  *      2. If we did not see packets requiring redirects
1310  *         during ip_rt_redirect_silence, we assume that the host
1311  *         forgot the redirected route and start sending redirects again.
1312  *
1313  * This algorithm is much cheaper and more intelligent than dumb load limiting
1314  * in icmp.c.
1315  *
1316  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1317  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1318  */
1319
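/*
 * Worked example (illustrative, assuming HZ = 1000): ip_rt_redirect_load
 * is HZ/50 = 20 jiffies, so after the n-th redirect the next one is held
 * back for 20 << n jiffies (40 ms, 80 ms, ... up to 20 << 8 = 5120 ms).
 * Once ip_rt_redirect_number = 9 redirects have been sent we go silent,
 * and only resume after ip_rt_redirect_silence = (HZ/50) << 10 = 20480
 * jiffies (about 20 s) pass without seeing packets that would need a
 * redirect.
 */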
1320 void ip_rt_send_redirect(struct sk_buff *skb)
1321 {
1322         struct rtable *rt = (struct rtable*)skb->dst;
1323         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1324
1325         if (!in_dev)
1326                 return;
1327
1328         if (!IN_DEV_TX_REDIRECTS(in_dev))
1329                 goto out;
1330
1331         /* No redirected packets during ip_rt_redirect_silence;
1332          * reset the algorithm.
1333          */
1334         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1335                 rt->u.dst.rate_tokens = 0;
1336
1337         /* Too many ignored redirects; do not send anything,
1338          * just set u.dst.rate_last to the last seen redirected packet.
1339          */
1340         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1341                 rt->u.dst.rate_last = jiffies;
1342                 goto out;
1343         }
1344
1345         /* Check for load limit; set rate_last to the latest sent
1346          * redirect.
1347          */
1348         if (rt->u.dst.rate_tokens == 0 ||
1349             time_after(jiffies,
1350                        (rt->u.dst.rate_last +
1351                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1352                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1353                 rt->u.dst.rate_last = jiffies;
1354                 ++rt->u.dst.rate_tokens;
1355 #ifdef CONFIG_IP_ROUTE_VERBOSE
1356                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1357                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1358                     net_ratelimit())
1359                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1360                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1361                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1362                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1363 #endif
1364         }
1365 out:
1366         in_dev_put(in_dev);
1367 }
1368
1369 static int ip_error(struct sk_buff *skb)
1370 {
1371         struct rtable *rt = (struct rtable*)skb->dst;
1372         unsigned long now;
1373         int code;
1374
1375         switch (rt->u.dst.error) {
1376                 case EINVAL:
1377                 default:
1378                         goto out;
1379                 case EHOSTUNREACH:
1380                         code = ICMP_HOST_UNREACH;
1381                         break;
1382                 case ENETUNREACH:
1383                         code = ICMP_NET_UNREACH;
1384                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1385                         break;
1386                 case EACCES:
1387                         code = ICMP_PKT_FILTERED;
1388                         break;
1389         }
1390
1391         now = jiffies;
1392         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1393         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1394                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1395         rt->u.dst.rate_last = now;
1396         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1397                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1398                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1399         }
1400
1401 out:    kfree_skb(skb);
1402         return 0;
1403 }
1404
1405 /*
1406  *      The last two values are not from the RFC but
1407  *      are needed for AMPRnet AX.25 paths.
1408  */
1409
1410 static const unsigned short mtu_plateau[] =
1411 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1412
1413 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1414 {
1415         int i;
1416
1417         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1418                 if (old_mtu > mtu_plateau[i])
1419                         return mtu_plateau[i];
1420         return 68;
1421 }
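/*
 * Worked example (illustrative): guess_mtu(1500) returns 1492, the next
 * plateau strictly below 1500; guess_mtu(576) returns 296, since the
 * returned plateau must be smaller than the old MTU; anything at or
 * below 128 falls through to the minimum of 68.
 */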
1422
1423 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1424                                  unsigned short new_mtu)
1425 {
1426         int i;
1427         unsigned short old_mtu = ntohs(iph->tot_len);
1428         struct rtable *rth;
1429         __be32  skeys[2] = { iph->saddr, 0, };
1430         __be32  daddr = iph->daddr;
1431         unsigned short est_mtu = 0;
1432
1433         if (ipv4_config.no_pmtu_disc)
1434                 return 0;
1435
1436         for (i = 0; i < 2; i++) {
1437                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1438
1439                 rcu_read_lock();
1440                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1441                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1442                         if (rth->fl.fl4_dst == daddr &&
1443                             rth->fl.fl4_src == skeys[i] &&
1444                             rth->rt_dst  == daddr &&
1445                             rth->rt_src  == iph->saddr &&
1446                             rth->fl.iif == 0 &&
1447                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1448                             rth->u.dst.dev->nd_net == net) {
1449                                 unsigned short mtu = new_mtu;
1450
1451                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1452
1453                                         /* BSD 4.2 compatibility hack :-( */
1454                                         if (mtu == 0 &&
1455                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1456                                             old_mtu >= 68 + (iph->ihl << 2))
1457                                                 old_mtu -= iph->ihl << 2;
1458
1459                                         mtu = guess_mtu(old_mtu);
1460                                 }
1461                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1462                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1463                                                 dst_confirm(&rth->u.dst);
1464                                                 if (mtu < ip_rt_min_pmtu) {
1465                                                         mtu = ip_rt_min_pmtu;
1466                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1467                                                                 (1 << RTAX_MTU);
1468                                                 }
1469                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1470                                                 dst_set_expires(&rth->u.dst,
1471                                                         ip_rt_mtu_expires);
1472                                         }
1473                                         est_mtu = mtu;
1474                                 }
1475                         }
1476                 }
1477                 rcu_read_unlock();
1478         }
1479         return est_mtu ? : new_mtu;
1480 }
1481
1482 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1483 {
1484         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1485             !(dst_metric_locked(dst, RTAX_MTU))) {
1486                 if (mtu < ip_rt_min_pmtu) {
1487                         mtu = ip_rt_min_pmtu;
1488                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1489                 }
1490                 dst->metrics[RTAX_MTU-1] = mtu;
1491                 dst_set_expires(dst, ip_rt_mtu_expires);
1492                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1493         }
1494 }
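/*
 * Editor's sketch (illustrative, not part of the original file): both PMTU
 * learners above clamp an advertised MTU to ip_rt_min_pmtu, and when the
 * clamp fires they set the RTAX_MTU bit in the RTAX_LOCK metric so that the
 * dst_metric_locked() checks reject later, even smaller advertisements.
 * The lock word is a plain per-metric bitmask:
 *
 *	u32 lock = dst->metrics[RTAX_LOCK-1];
 *
 *	lock |= (1 << RTAX_MTU);		// lock the MTU metric
 *	if (lock & (1 << RTAX_MTU))		// what dst_metric_locked() tests
 *		;				// MTU may no longer shrink
 */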
1495
1496 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1497 {
1498         return NULL;
1499 }
1500
1501 static void ipv4_dst_destroy(struct dst_entry *dst)
1502 {
1503         struct rtable *rt = (struct rtable *) dst;
1504         struct inet_peer *peer = rt->peer;
1505         struct in_device *idev = rt->idev;
1506
1507         if (peer) {
1508                 rt->peer = NULL;
1509                 inet_putpeer(peer);
1510         }
1511
1512         if (idev) {
1513                 rt->idev = NULL;
1514                 in_dev_put(idev);
1515         }
1516 }
1517
1518 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1519                             int how)
1520 {
1521         struct rtable *rt = (struct rtable *) dst;
1522         struct in_device *idev = rt->idev;
1523         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1524                 struct in_device *loopback_idev =
1525                         in_dev_get(dev->nd_net->loopback_dev);
1526                 if (loopback_idev) {
1527                         rt->idev = loopback_idev;
1528                         in_dev_put(idev);
1529                 }
1530         }
1531 }
1532
1533 static void ipv4_link_failure(struct sk_buff *skb)
1534 {
1535         struct rtable *rt;
1536
1537         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1538
1539         rt = (struct rtable *) skb->dst;
1540         if (rt)
1541                 dst_set_expires(&rt->u.dst, 0);
1542 }
1543
1544 static int ip_rt_bug(struct sk_buff *skb)
1545 {
1546         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1547                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1548                 skb->dev ? skb->dev->name : "?");
1549         kfree_skb(skb);
1550         return 0;
1551 }
1552
1553 /*
1554    We do not cache the source address of the outgoing interface,
1555    because it is used only by the IP RR, TS and SRR options,
1556    so it stays out of the fast path.
1557
1558    Also remember: "addr" is allowed to be unaligned when it
1559    points into IP options!
1560  */
1561
1562 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1563 {
1564         __be32 src;
1565         struct fib_result res;
1566
1567         if (rt->fl.iif == 0)
1568                 src = rt->rt_src;
1569         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1570                 src = FIB_RES_PREFSRC(res);
1571                 fib_res_put(&res);
1572         } else
1573                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1574                                         RT_SCOPE_UNIVERSE);
1575         memcpy(addr, &src, 4);
1576 }
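/*
 * Editor's note (illustrative, not part of the original file): the memcpy()
 * above is deliberate.  "addr" points into the IP options area of the packet,
 * which carries no alignment guarantee, so a direct 32-bit store such as
 *
 *	*(__be32 *)addr = src;		// potentially unaligned access
 *
 * could fault or be slow on strict-alignment architectures; copying four
 * bytes is always safe.
 */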
1577
1578 #ifdef CONFIG_NET_CLS_ROUTE
1579 static void set_class_tag(struct rtable *rt, u32 tag)
1580 {
1581         if (!(rt->u.dst.tclassid & 0xFFFF))
1582                 rt->u.dst.tclassid |= tag & 0xFFFF;
1583         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1584                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1585 }
1586 #endif
1587
1588 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1589 {
1590         struct fib_info *fi = res->fi;
1591
1592         if (fi) {
1593                 if (FIB_RES_GW(*res) &&
1594                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1595                         rt->rt_gateway = FIB_RES_GW(*res);
1596                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1597                        sizeof(rt->u.dst.metrics));
1598                 if (fi->fib_mtu == 0) {
1599                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1600                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1601                             rt->rt_gateway != rt->rt_dst &&
1602                             rt->u.dst.dev->mtu > 576)
1603                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1604                 }
1605 #ifdef CONFIG_NET_CLS_ROUTE
1606                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1607 #endif
1608         } else
1609                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1610
1611         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1612                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1613         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1614                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1615         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1616                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1617                                        ip_rt_min_advmss);
1618         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1619                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1620
1621 #ifdef CONFIG_NET_CLS_ROUTE
1622 #ifdef CONFIG_IP_MULTIPLE_TABLES
1623         set_class_tag(rt, fib_rules_tclass(res));
1624 #endif
1625         set_class_tag(rt, itag);
1626 #endif
1627         rt->rt_type = res->type;
1628 }
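/*
 * Editor's note (worked example, not part of the original file): for a FIB
 * entry with no explicit metrics on a plain Ethernet device (mtu 1500),
 * rt_set_nexthop() above leaves roughly:
 *
 *	RTAX_MTU      = 1500			// device MTU
 *	RTAX_ADVMSS   = 1500 - 40 = 1460	// room for IP + TCP headers
 *	RTAX_HOPLIMIT = sysctl_ip_default_ttl	// usually 64
 */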
1629
1630 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1631                                 u8 tos, struct net_device *dev, int our)
1632 {
1633         unsigned hash;
1634         struct rtable *rth;
1635         __be32 spec_dst;
1636         struct in_device *in_dev = in_dev_get(dev);
1637         u32 itag = 0;
1638
1639         /* Primary sanity checks. */
1640
1641         if (in_dev == NULL)
1642                 return -EINVAL;
1643
1644         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1645             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1646                 goto e_inval;
1647
1648         if (ipv4_is_zeronet(saddr)) {
1649                 if (!ipv4_is_local_multicast(daddr))
1650                         goto e_inval;
1651                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1652         } else if (fib_validate_source(saddr, 0, tos, 0,
1653                                         dev, &spec_dst, &itag) < 0)
1654                 goto e_inval;
1655
1656         rth = dst_alloc(&ipv4_dst_ops);
1657         if (!rth)
1658                 goto e_nobufs;
1659
1660         rth->u.dst.output= ip_rt_bug;
1661
1662         atomic_set(&rth->u.dst.__refcnt, 1);
1663         rth->u.dst.flags= DST_HOST;
1664         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1665                 rth->u.dst.flags |= DST_NOPOLICY;
1666         rth->fl.fl4_dst = daddr;
1667         rth->rt_dst     = daddr;
1668         rth->fl.fl4_tos = tos;
1669         rth->fl.mark    = skb->mark;
1670         rth->fl.fl4_src = saddr;
1671         rth->rt_src     = saddr;
1672 #ifdef CONFIG_NET_CLS_ROUTE
1673         rth->u.dst.tclassid = itag;
1674 #endif
1675         rth->rt_iif     =
1676         rth->fl.iif     = dev->ifindex;
1677         rth->u.dst.dev  = init_net.loopback_dev;
1678         dev_hold(rth->u.dst.dev);
1679         rth->idev       = in_dev_get(rth->u.dst.dev);
1680         rth->fl.oif     = 0;
1681         rth->rt_gateway = daddr;
1682         rth->rt_spec_dst= spec_dst;
1683         rth->rt_type    = RTN_MULTICAST;
1684         rth->rt_flags   = RTCF_MULTICAST;
1685         if (our) {
1686                 rth->u.dst.input= ip_local_deliver;
1687                 rth->rt_flags |= RTCF_LOCAL;
1688         }
1689
1690 #ifdef CONFIG_IP_MROUTE
1691         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1692                 rth->u.dst.input = ip_mr_input;
1693 #endif
1694         RT_CACHE_STAT_INC(in_slow_mc);
1695
1696         in_dev_put(in_dev);
1697         hash = rt_hash(daddr, saddr, dev->ifindex);
1698         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1699
1700 e_nobufs:
1701         in_dev_put(in_dev);
1702         return -ENOBUFS;
1703
1704 e_inval:
1705         in_dev_put(in_dev);
1706         return -EINVAL;
1707 }
1708
1709
1710 static void ip_handle_martian_source(struct net_device *dev,
1711                                      struct in_device *in_dev,
1712                                      struct sk_buff *skb,
1713                                      __be32 daddr,
1714                                      __be32 saddr)
1715 {
1716         RT_CACHE_STAT_INC(in_martian_src);
1717 #ifdef CONFIG_IP_ROUTE_VERBOSE
1718         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1719                 /*
1720                  *      RFC1812 recommendation: if the source is martian,
1721                  *      the only hint we have is the MAC header.
1722                  */
1723                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1724                         "%u.%u.%u.%u, on dev %s\n",
1725                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1726                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1727                         int i;
1728                         const unsigned char *p = skb_mac_header(skb);
1729                         printk(KERN_WARNING "ll header: ");
1730                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1731                                 printk("%02x", *p);
1732                                 if (i < (dev->hard_header_len - 1))
1733                                         printk(":");
1734                         }
1735                         printk("\n");
1736                 }
1737         }
1738 #endif
1739 }
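/*
 * Editor's note (example output, inferred from the printks above and not
 * part of the original file): a martian source seen on eth0 is logged
 * roughly as
 *
 *	martian source 10.0.0.1 from 127.0.0.1, on dev eth0
 *	ll header: 00:11:22:33:44:55:66:77:88:99:aa:bb:08:00
 *
 * i.e. the destination first, the offending source after "from", then the
 * raw link-layer header (14 bytes on Ethernet) as colon-separated hex.
 */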
1740
1741 static inline int __mkroute_input(struct sk_buff *skb,
1742                                   struct fib_result* res,
1743                                   struct in_device *in_dev,
1744                                   __be32 daddr, __be32 saddr, u32 tos,
1745                                   struct rtable **result)
1746 {
1747
1748         struct rtable *rth;
1749         int err;
1750         struct in_device *out_dev;
1751         unsigned flags = 0;
1752         __be32 spec_dst;
1753         u32 itag;
1754
1755         /* get a working reference to the output device */
1756         out_dev = in_dev_get(FIB_RES_DEV(*res));
1757         if (out_dev == NULL) {
1758                 if (net_ratelimit())
1759                         printk(KERN_CRIT "Bug in ip_route_input" \
1760                                "_slow(). Please, report\n");
1761                 return -EINVAL;
1762         }
1763
1764
1765         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1766                                   in_dev->dev, &spec_dst, &itag);
1767         if (err < 0) {
1768                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1769                                          saddr);
1770
1771                 err = -EINVAL;
1772                 goto cleanup;
1773         }
1774
1775         if (err)
1776                 flags |= RTCF_DIRECTSRC;
1777
1778         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1779             (IN_DEV_SHARED_MEDIA(out_dev) ||
1780              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1781                 flags |= RTCF_DOREDIRECT;
1782
1783         if (skb->protocol != htons(ETH_P_IP)) {
1784                 /* Not IP (i.e. ARP). Do not create a route if it is
1785                  * invalid for proxy arp. DNAT routes are always valid.
1786                  */
1787                 if (out_dev == in_dev) {
1788                         err = -EINVAL;
1789                         goto cleanup;
1790                 }
1791         }
1792
1793
1794         rth = dst_alloc(&ipv4_dst_ops);
1795         if (!rth) {
1796                 err = -ENOBUFS;
1797                 goto cleanup;
1798         }
1799
1800         atomic_set(&rth->u.dst.__refcnt, 1);
1801         rth->u.dst.flags= DST_HOST;
1802         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1803                 rth->u.dst.flags |= DST_NOPOLICY;
1804         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1805                 rth->u.dst.flags |= DST_NOXFRM;
1806         rth->fl.fl4_dst = daddr;
1807         rth->rt_dst     = daddr;
1808         rth->fl.fl4_tos = tos;
1809         rth->fl.mark    = skb->mark;
1810         rth->fl.fl4_src = saddr;
1811         rth->rt_src     = saddr;
1812         rth->rt_gateway = daddr;
1813         rth->rt_iif     =
1814                 rth->fl.iif     = in_dev->dev->ifindex;
1815         rth->u.dst.dev  = (out_dev)->dev;
1816         dev_hold(rth->u.dst.dev);
1817         rth->idev       = in_dev_get(rth->u.dst.dev);
1818         rth->fl.oif     = 0;
1819         rth->rt_spec_dst= spec_dst;
1820
1821         rth->u.dst.input = ip_forward;
1822         rth->u.dst.output = ip_output;
1823
1824         rt_set_nexthop(rth, res, itag);
1825
1826         rth->rt_flags = flags;
1827
1828         *result = rth;
1829         err = 0;
1830  cleanup:
1831         /* release the working reference to the output device */
1832         in_dev_put(out_dev);
1833         return err;
1834 }
1835
1836 static inline int ip_mkroute_input(struct sk_buff *skb,
1837                                    struct fib_result* res,
1838                                    const struct flowi *fl,
1839                                    struct in_device *in_dev,
1840                                    __be32 daddr, __be32 saddr, u32 tos)
1841 {
1842         struct rtable* rth = NULL;
1843         int err;
1844         unsigned hash;
1845
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1847         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1848                 fib_select_multipath(fl, res);
1849 #endif
1850
1851         /* create a routing cache entry */
1852         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1853         if (err)
1854                 return err;
1855
1856         /* put it into the cache */
1857         hash = rt_hash(daddr, saddr, fl->iif);
1858         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1859 }
1860
1861 /*
1862  *      NOTE. We drop all packets that have a local source
1863  *      address, because every properly looped-back packet
1864  *      must already have the correct destination attached by the output routine.
1865  *
1866  *      This approach solves two big problems:
1867  *      1. Non-simplex devices are handled properly.
1868  *      2. IP spoofing attempts are filtered out with 100% certainty.
1869  */
1870
1871 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1872                                u8 tos, struct net_device *dev)
1873 {
1874         struct fib_result res;
1875         struct in_device *in_dev = in_dev_get(dev);
1876         struct flowi fl = { .nl_u = { .ip4_u =
1877                                       { .daddr = daddr,
1878                                         .saddr = saddr,
1879                                         .tos = tos,
1880                                         .scope = RT_SCOPE_UNIVERSE,
1881                                       } },
1882                             .mark = skb->mark,
1883                             .iif = dev->ifindex };
1884         unsigned        flags = 0;
1885         u32             itag = 0;
1886         struct rtable * rth;
1887         unsigned        hash;
1888         __be32          spec_dst;
1889         int             err = -EINVAL;
1890         int             free_res = 0;
1891         struct net    * net = dev->nd_net;
1892
1893         /* IP on this device is disabled. */
1894
1895         if (!in_dev)
1896                 goto out;
1897
1898         /* Check for the weirdest martians, which cannot be detected
1899            by fib_lookup.
1900          */
1901
1902         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1903             ipv4_is_loopback(saddr))
1904                 goto martian_source;
1905
1906         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1907                 goto brd_input;
1908
1909         /* Accept zero addresses only for limited broadcast;
1910          * it is unclear whether this should be fixed. Waiting for complaints :-)
1911          */
1912         if (ipv4_is_zeronet(saddr))
1913                 goto martian_source;
1914
1915         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1916             ipv4_is_loopback(daddr))
1917                 goto martian_destination;
1918
1919         /*
1920          *      Now we are ready to route the packet.
1921          */
1922         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1923                 if (!IN_DEV_FORWARD(in_dev))
1924                         goto e_hostunreach;
1925                 goto no_route;
1926         }
1927         free_res = 1;
1928
1929         RT_CACHE_STAT_INC(in_slow_tot);
1930
1931         if (res.type == RTN_BROADCAST)
1932                 goto brd_input;
1933
1934         if (res.type == RTN_LOCAL) {
1935                 int result;
1936                 result = fib_validate_source(saddr, daddr, tos,
1937                                              net->loopback_dev->ifindex,
1938                                              dev, &spec_dst, &itag);
1939                 if (result < 0)
1940                         goto martian_source;
1941                 if (result)
1942                         flags |= RTCF_DIRECTSRC;
1943                 spec_dst = daddr;
1944                 goto local_input;
1945         }
1946
1947         if (!IN_DEV_FORWARD(in_dev))
1948                 goto e_hostunreach;
1949         if (res.type != RTN_UNICAST)
1950                 goto martian_destination;
1951
1952         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1953 done:
1954         in_dev_put(in_dev);
1955         if (free_res)
1956                 fib_res_put(&res);
1957 out:    return err;
1958
1959 brd_input:
1960         if (skb->protocol != htons(ETH_P_IP))
1961                 goto e_inval;
1962
1963         if (ipv4_is_zeronet(saddr))
1964                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1965         else {
1966                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1967                                           &itag);
1968                 if (err < 0)
1969                         goto martian_source;
1970                 if (err)
1971                         flags |= RTCF_DIRECTSRC;
1972         }
1973         flags |= RTCF_BROADCAST;
1974         res.type = RTN_BROADCAST;
1975         RT_CACHE_STAT_INC(in_brd);
1976
1977 local_input:
1978         rth = dst_alloc(&ipv4_dst_ops);
1979         if (!rth)
1980                 goto e_nobufs;
1981
1982         rth->u.dst.output= ip_rt_bug;
1983
1984         atomic_set(&rth->u.dst.__refcnt, 1);
1985         rth->u.dst.flags= DST_HOST;
1986         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1987                 rth->u.dst.flags |= DST_NOPOLICY;
1988         rth->fl.fl4_dst = daddr;
1989         rth->rt_dst     = daddr;
1990         rth->fl.fl4_tos = tos;
1991         rth->fl.mark    = skb->mark;
1992         rth->fl.fl4_src = saddr;
1993         rth->rt_src     = saddr;
1994 #ifdef CONFIG_NET_CLS_ROUTE
1995         rth->u.dst.tclassid = itag;
1996 #endif
1997         rth->rt_iif     =
1998         rth->fl.iif     = dev->ifindex;
1999         rth->u.dst.dev  = net->loopback_dev;
2000         dev_hold(rth->u.dst.dev);
2001         rth->idev       = in_dev_get(rth->u.dst.dev);
2002         rth->rt_gateway = daddr;
2003         rth->rt_spec_dst= spec_dst;
2004         rth->u.dst.input= ip_local_deliver;
2005         rth->rt_flags   = flags|RTCF_LOCAL;
2006         if (res.type == RTN_UNREACHABLE) {
2007                 rth->u.dst.input= ip_error;
2008                 rth->u.dst.error= -err;
2009                 rth->rt_flags   &= ~RTCF_LOCAL;
2010         }
2011         rth->rt_type    = res.type;
2012         hash = rt_hash(daddr, saddr, fl.iif);
2013         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2014         goto done;
2015
2016 no_route:
2017         RT_CACHE_STAT_INC(in_no_route);
2018         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2019         res.type = RTN_UNREACHABLE;
2020         if (err == -ESRCH)
2021                 err = -ENETUNREACH;
2022         goto local_input;
2023
2024         /*
2025          *      Do not cache martian addresses: they should be logged (RFC1812)
2026          */
2027 martian_destination:
2028         RT_CACHE_STAT_INC(in_martian_dst);
2029 #ifdef CONFIG_IP_ROUTE_VERBOSE
2030         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2031                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2032                         "%u.%u.%u.%u, dev %s\n",
2033                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2034 #endif
2035
2036 e_hostunreach:
2037         err = -EHOSTUNREACH;
2038         goto done;
2039
2040 e_inval:
2041         err = -EINVAL;
2042         goto done;
2043
2044 e_nobufs:
2045         err = -ENOBUFS;
2046         goto done;
2047
2048 martian_source:
2049         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2050         goto e_inval;
2051 }
2052
2053 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2054                    u8 tos, struct net_device *dev)
2055 {
2056         struct rtable * rth;
2057         unsigned        hash;
2058         int iif = dev->ifindex;
2059         struct net *net;
2060
2061         net = skb->dev->nd_net;
2062         tos &= IPTOS_RT_MASK;
2063         hash = rt_hash(daddr, saddr, iif);
2064
2065         rcu_read_lock();
2066         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2067              rth = rcu_dereference(rth->u.dst.rt_next)) {
2068                 if (rth->fl.fl4_dst == daddr &&
2069                     rth->fl.fl4_src == saddr &&
2070                     rth->fl.iif == iif &&
2071                     rth->fl.oif == 0 &&
2072                     rth->fl.mark == skb->mark &&
2073                     rth->fl.fl4_tos == tos &&
2074                     rth->u.dst.dev->nd_net == net) {
2075                         dst_use(&rth->u.dst, jiffies);
2076                         RT_CACHE_STAT_INC(in_hit);
2077                         rcu_read_unlock();
2078                         skb->dst = (struct dst_entry*)rth;
2079                         return 0;
2080                 }
2081                 RT_CACHE_STAT_INC(in_hlist_search);
2082         }
2083         rcu_read_unlock();
2084
2085         /* Multicast recognition logic has moved from the route cache to here.
2086            The problem was that too many Ethernet cards have broken/missing
2087            hardware multicast filters :-( As a result, a host on a multicast
2088            network acquires a lot of useless route cache entries, e.g. from
2089            SDR messages from all over the world. Now we try to get rid of them.
2090            Really, provided the software IP multicast filter is organized
2091            reasonably (at least, hashed), this is no slower than rejecting
2092            them via route cache entries.
2093            Note that multicast routers are not affected, because a
2094            route cache entry is created for them eventually.
2095          */
2096         if (ipv4_is_multicast(daddr)) {
2097                 struct in_device *in_dev;
2098
2099                 rcu_read_lock();
2100                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2101                         int our = ip_check_mc(in_dev, daddr, saddr,
2102                                 ip_hdr(skb)->protocol);
2103                         if (our
2104 #ifdef CONFIG_IP_MROUTE
2105                             || (!ipv4_is_local_multicast(daddr) &&
2106                                 IN_DEV_MFORWARD(in_dev))
2107 #endif
2108                             ) {
2109                                 rcu_read_unlock();
2110                                 return ip_route_input_mc(skb, daddr, saddr,
2111                                                          tos, dev, our);
2112                         }
2113                 }
2114                 rcu_read_unlock();
2115                 return -EINVAL;
2116         }
2117         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2118 }
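/*
 * Editor's sketch (hypothetical caller, not part of the original file): this
 * is roughly how the receive path (ip_rcv_finish) hands a packet to the
 * router; on success skb->dst is set and dst_input() continues with either
 * ip_local_deliver() or ip_forward().
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;			// no route: free the skb
 *	return dst_input(skb);
 */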
2119
2120 static inline int __mkroute_output(struct rtable **result,
2121                                    struct fib_result* res,
2122                                    const struct flowi *fl,
2123                                    const struct flowi *oldflp,
2124                                    struct net_device *dev_out,
2125                                    unsigned flags)
2126 {
2127         struct rtable *rth;
2128         struct in_device *in_dev;
2129         u32 tos = RT_FL_TOS(oldflp);
2130         int err = 0;
2131
2132         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2133                 return -EINVAL;
2134
2135         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2136                 res->type = RTN_BROADCAST;
2137         else if (ipv4_is_multicast(fl->fl4_dst))
2138                 res->type = RTN_MULTICAST;
2139         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2140                 return -EINVAL;
2141
2142         if (dev_out->flags & IFF_LOOPBACK)
2143                 flags |= RTCF_LOCAL;
2144
2145         /* get a working reference to the inet device */
2146         in_dev = in_dev_get(dev_out);
2147         if (!in_dev)
2148                 return -EINVAL;
2149
2150         if (res->type == RTN_BROADCAST) {
2151                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2152                 if (res->fi) {
2153                         fib_info_put(res->fi);
2154                         res->fi = NULL;
2155                 }
2156         } else if (res->type == RTN_MULTICAST) {
2157                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2158                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2159                                  oldflp->proto))
2160                         flags &= ~RTCF_LOCAL;
2161                 /* If no multicast route exists, use the
2162                    default one, but do not use a gateway in this case.
2163                    Yes, it is a hack.
2164                  */
2165                 if (res->fi && res->prefixlen < 4) {
2166                         fib_info_put(res->fi);
2167                         res->fi = NULL;
2168                 }
2169         }
2170
2171
2172         rth = dst_alloc(&ipv4_dst_ops);
2173         if (!rth) {
2174                 err = -ENOBUFS;
2175                 goto cleanup;
2176         }
2177
2178         atomic_set(&rth->u.dst.__refcnt, 1);
2179         rth->u.dst.flags= DST_HOST;
2180         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2181                 rth->u.dst.flags |= DST_NOXFRM;
2182         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2183                 rth->u.dst.flags |= DST_NOPOLICY;
2184
2185         rth->fl.fl4_dst = oldflp->fl4_dst;
2186         rth->fl.fl4_tos = tos;
2187         rth->fl.fl4_src = oldflp->fl4_src;
2188         rth->fl.oif     = oldflp->oif;
2189         rth->fl.mark    = oldflp->mark;
2190         rth->rt_dst     = fl->fl4_dst;
2191         rth->rt_src     = fl->fl4_src;
2192         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2193         /* get references to the devices that are to be held by the routing
2194            cache entry */
2195         rth->u.dst.dev  = dev_out;
2196         dev_hold(dev_out);
2197         rth->idev       = in_dev_get(dev_out);
2198         rth->rt_gateway = fl->fl4_dst;
2199         rth->rt_spec_dst= fl->fl4_src;
2200
2201         rth->u.dst.output=ip_output;
2202
2203         RT_CACHE_STAT_INC(out_slow_tot);
2204
2205         if (flags & RTCF_LOCAL) {
2206                 rth->u.dst.input = ip_local_deliver;
2207                 rth->rt_spec_dst = fl->fl4_dst;
2208         }
2209         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2210                 rth->rt_spec_dst = fl->fl4_src;
2211                 if (flags & RTCF_LOCAL &&
2212                     !(dev_out->flags & IFF_LOOPBACK)) {
2213                         rth->u.dst.output = ip_mc_output;
2214                         RT_CACHE_STAT_INC(out_slow_mc);
2215                 }
2216 #ifdef CONFIG_IP_MROUTE
2217                 if (res->type == RTN_MULTICAST) {
2218                         if (IN_DEV_MFORWARD(in_dev) &&
2219                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2220                                 rth->u.dst.input = ip_mr_input;
2221                                 rth->u.dst.output = ip_mc_output;
2222                         }
2223                 }
2224 #endif
2225         }
2226
2227         rt_set_nexthop(rth, res, 0);
2228
2229         rth->rt_flags = flags;
2230
2231         *result = rth;
2232  cleanup:
2233         /* release the working reference to the inet device */
2234         in_dev_put(in_dev);
2235
2236         return err;
2237 }
2238
2239 static inline int ip_mkroute_output(struct rtable **rp,
2240                                     struct fib_result* res,
2241                                     const struct flowi *fl,
2242                                     const struct flowi *oldflp,
2243                                     struct net_device *dev_out,
2244                                     unsigned flags)
2245 {
2246         struct rtable *rth = NULL;
2247         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2248         unsigned hash;
2249         if (err == 0) {
2250                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2251                 err = rt_intern_hash(hash, rth, rp);
2252         }
2253
2254         return err;
2255 }
2256
2257 /*
2258  * Major route resolver routine.
2259  */
2260
2261 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2262                                 const struct flowi *oldflp)
2263 {
2264         u32 tos = RT_FL_TOS(oldflp);
2265         struct flowi fl = { .nl_u = { .ip4_u =
2266                                       { .daddr = oldflp->fl4_dst,
2267                                         .saddr = oldflp->fl4_src,
2268                                         .tos = tos & IPTOS_RT_MASK,
2269                                         .scope = ((tos & RTO_ONLINK) ?
2270                                                   RT_SCOPE_LINK :
2271                                                   RT_SCOPE_UNIVERSE),
2272                                       } },
2273                             .mark = oldflp->mark,
2274                             .iif = net->loopback_dev->ifindex,
2275                             .oif = oldflp->oif };
2276         struct fib_result res;
2277         unsigned flags = 0;
2278         struct net_device *dev_out = NULL;
2279         int free_res = 0;
2280         int err;
2281
2282
2283         res.fi          = NULL;
2284 #ifdef CONFIG_IP_MULTIPLE_TABLES
2285         res.r           = NULL;
2286 #endif
2287
2288         if (oldflp->fl4_src) {
2289                 err = -EINVAL;
2290                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2291                     ipv4_is_lbcast(oldflp->fl4_src) ||
2292                     ipv4_is_zeronet(oldflp->fl4_src))
2293                         goto out;
2294
2295                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2296                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2297                 if (dev_out == NULL)
2298                         goto out;
2299
2300                 /* I removed the check for oif == dev_out->oif here.
2301                    It was wrong for two reasons:
2302                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2303                       is assigned to multiple interfaces.
2304                    2. Moreover, we are allowed to send packets with the saddr
2305                       of another iface. --ANK
2306                  */
2307
2308                 if (oldflp->oif == 0
2309                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2310                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2311                         /* Special hack: the user can direct multicasts
2312                            and limited broadcast via the desired interface
2313                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2314                            This hack is not just for fun, it allows
2315                            vic, vat and friends to work.
2316                            They bind the socket to loopback, set the ttl to zero
2317                            and expect that it will work.
2318                            From the viewpoint of the routing cache they are broken,
2319                            because we are not allowed to build a multicast path
2320                            with a loopback source addr (the routing cache
2321                            cannot know that the ttl is zero, so that the packet
2322                            will never leave this host and the route is valid).
2323                            Luckily, this hack is a good workaround.
2324                          */
2325
2326                         fl.oif = dev_out->ifindex;
2327                         goto make_route;
2328                 }
2329                 if (dev_out)
2330                         dev_put(dev_out);
2331                 dev_out = NULL;
2332         }
2333
2334
2335         if (oldflp->oif) {
2336                 dev_out = dev_get_by_index(net, oldflp->oif);
2337                 err = -ENODEV;
2338                 if (dev_out == NULL)
2339                         goto out;
2340
2341                 /* RACE: Check return value of inet_select_addr instead. */
2342                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2343                         dev_put(dev_out);
2344                         goto out;       /* Wrong error code */
2345                 }
2346
2347                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2348                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2349                         if (!fl.fl4_src)
2350                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2351                                                               RT_SCOPE_LINK);
2352                         goto make_route;
2353                 }
2354                 if (!fl.fl4_src) {
2355                         if (ipv4_is_multicast(oldflp->fl4_dst))
2356                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2357                                                               fl.fl4_scope);
2358                         else if (!oldflp->fl4_dst)
2359                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2360                                                               RT_SCOPE_HOST);
2361                 }
2362         }
2363
2364         if (!fl.fl4_dst) {
2365                 fl.fl4_dst = fl.fl4_src;
2366                 if (!fl.fl4_dst)
2367                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2368                 if (dev_out)
2369                         dev_put(dev_out);
2370                 dev_out = net->loopback_dev;
2371                 dev_hold(dev_out);
2372                 fl.oif = net->loopback_dev->ifindex;
2373                 res.type = RTN_LOCAL;
2374                 flags |= RTCF_LOCAL;
2375                 goto make_route;
2376         }
2377
2378         if (fib_lookup(net, &fl, &res)) {
2379                 res.fi = NULL;
2380                 if (oldflp->oif) {
2381                         /* Apparently, the routing tables are wrong. Assume
2382                            that the destination is on-link.
2383
2384                            WHY? DW.
2385                            Because we are allowed to send to iface
2386                            even if it has NO routes and NO assigned
2387                            addresses. When oif is specified, routing
2388                            tables are looked up with only one purpose:
2389                            to catch if destination is gatewayed, rather than
2390                            direct. Moreover, if MSG_DONTROUTE is set,
2391                            we send packet, ignoring both routing tables
2392                            and ifaddr state. --ANK
2393
2394
2395                            We could make it even if oif is unknown,
2396                            likely IPv6, but we do not.
2397                          */
2398
2399                         if (fl.fl4_src == 0)
2400                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2401                                                               RT_SCOPE_LINK);
2402                         res.type = RTN_UNICAST;
2403                         goto make_route;
2404                 }
2405                 if (dev_out)
2406                         dev_put(dev_out);
2407                 err = -ENETUNREACH;
2408                 goto out;
2409         }
2410         free_res = 1;
2411
2412         if (res.type == RTN_LOCAL) {
2413                 if (!fl.fl4_src)
2414                         fl.fl4_src = fl.fl4_dst;
2415                 if (dev_out)
2416                         dev_put(dev_out);
2417                 dev_out = net->loopback_dev;
2418                 dev_hold(dev_out);
2419                 fl.oif = dev_out->ifindex;
2420                 if (res.fi)
2421                         fib_info_put(res.fi);
2422                 res.fi = NULL;
2423                 flags |= RTCF_LOCAL;
2424                 goto make_route;
2425         }
2426
2427 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2428         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2429                 fib_select_multipath(&fl, &res);
2430         else
2431 #endif
2432         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2433                 fib_select_default(net, &fl, &res);
2434
2435         if (!fl.fl4_src)
2436                 fl.fl4_src = FIB_RES_PREFSRC(res);
2437
2438         if (dev_out)
2439                 dev_put(dev_out);
2440         dev_out = FIB_RES_DEV(res);
2441         dev_hold(dev_out);
2442         fl.oif = dev_out->ifindex;
2443
2444
2445 make_route:
2446         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2447
2448
2449         if (free_res)
2450                 fib_res_put(&res);
2451         if (dev_out)
2452                 dev_put(dev_out);
2453 out:    return err;
2454 }
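/*
 * Editor's note (not part of the original file): RT_FL_TOS() preserves the
 * RTO_ONLINK bit alongside the real TOS bits, so a caller can request a
 * link-scoped lookup (ignore gateways, destination must be directly
 * reachable) by OR-ing RTO_ONLINK into the flow's TOS; the scope computed at
 * the top of ip_route_output_slow() is then RT_SCOPE_LINK instead of
 * RT_SCOPE_UNIVERSE.
 */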
2455
2456 int __ip_route_output_key(struct net *net, struct rtable **rp,
2457                           const struct flowi *flp)
2458 {
2459         unsigned hash;
2460         struct rtable *rth;
2461
2462         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2463
2464         rcu_read_lock_bh();
2465         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2466                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2467                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2468                     rth->fl.fl4_src == flp->fl4_src &&
2469                     rth->fl.iif == 0 &&
2470                     rth->fl.oif == flp->oif &&
2471                     rth->fl.mark == flp->mark &&
2472                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2473                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2474                     rth->u.dst.dev->nd_net == net) {
2475                         dst_use(&rth->u.dst, jiffies);
2476                         RT_CACHE_STAT_INC(out_hit);
2477                         rcu_read_unlock_bh();
2478                         *rp = rth;
2479                         return 0;
2480                 }
2481                 RT_CACHE_STAT_INC(out_hlist_search);
2482         }
2483         rcu_read_unlock_bh();
2484
2485         return ip_route_output_slow(net, rp, flp);
2486 }
2487
2488 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2489
2490 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2491 {
2492 }
2493
2494 static struct dst_ops ipv4_dst_blackhole_ops = {
2495         .family                 =       AF_INET,
2496         .protocol               =       __constant_htons(ETH_P_IP),
2497         .destroy                =       ipv4_dst_destroy,
2498         .check                  =       ipv4_dst_check,
2499         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2500         .entry_size             =       sizeof(struct rtable),
2501 };
2502
2503
2504 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2505 {
2506         struct rtable *ort = *rp;
2507         struct rtable *rt = (struct rtable *)
2508                 dst_alloc(&ipv4_dst_blackhole_ops);
2509
2510         if (rt) {
2511                 struct dst_entry *new = &rt->u.dst;
2512
2513                 atomic_set(&new->__refcnt, 1);
2514                 new->__use = 1;
2515                 new->input = dst_discard;
2516                 new->output = dst_discard;
2517                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2518
2519                 new->dev = ort->u.dst.dev;
2520                 if (new->dev)
2521                         dev_hold(new->dev);
2522
2523                 rt->fl = ort->fl;
2524
2525                 rt->idev = ort->idev;
2526                 if (rt->idev)
2527                         in_dev_hold(rt->idev);
2528                 rt->rt_flags = ort->rt_flags;
2529                 rt->rt_type = ort->rt_type;
2530                 rt->rt_dst = ort->rt_dst;
2531                 rt->rt_src = ort->rt_src;
2532                 rt->rt_iif = ort->rt_iif;
2533                 rt->rt_gateway = ort->rt_gateway;
2534                 rt->rt_spec_dst = ort->rt_spec_dst;
2535                 rt->peer = ort->peer;
2536                 if (rt->peer)
2537                         atomic_inc(&rt->peer->refcnt);
2538
2539                 dst_free(new);
2540         }
2541
2542         dst_release(&(*rp)->u.dst);
2543         *rp = rt;
2544         return (rt ? 0 : -ENOMEM);
2545 }
2546
2547 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2548                          struct sock *sk, int flags)
2549 {
2550         int err;
2551
2552         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2553                 return err;
2554
2555         if (flp->proto) {
2556                 if (!flp->fl4_src)
2557                         flp->fl4_src = (*rp)->rt_src;
2558                 if (!flp->fl4_dst)
2559                         flp->fl4_dst = (*rp)->rt_dst;
2560                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2561                                     flags ? XFRM_LOOKUP_WAIT : 0);
2562                 if (err == -EREMOTE)
2563                         err = ipv4_dst_blackhole(rp, flp, sk);
2564
2565                 return err;
2566         }
2567
2568         return 0;
2569 }
2570
2571 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2572
2573 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2574 {
2575         return ip_route_output_flow(net, rp, flp, NULL, 0);
2576 }
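/*
 * Editor's sketch (hypothetical caller, not part of the original file): a
 * typical output lookup builds a flow key and lets the cache or the slow
 * path fill in the rtable.  "dst_ip" and "tos" below are placeholders.
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip,
 *						 .tos   = RT_TOS(tos) } },
 *			    .oif = 0 };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&init_net, &rt, &fl))
 *		return -EHOSTUNREACH;		// lookup failed
 *	// ... use rt->u.dst ...
 *	ip_rt_put(rt);				// drop the reference when done
 */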
2577
2578 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2579                         int nowait, unsigned int flags)
2580 {
2581         struct rtable *rt = (struct rtable*)skb->dst;
2582         struct rtmsg *r;
2583         struct nlmsghdr *nlh;
2584         long expires;
2585         u32 id = 0, ts = 0, tsage = 0, error;
2586
2587         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2588         if (nlh == NULL)
2589                 return -EMSGSIZE;
2590
2591         r = nlmsg_data(nlh);
2592         r->rtm_family    = AF_INET;
2593         r->rtm_dst_len  = 32;
2594         r->rtm_src_len  = 0;
2595         r->rtm_tos      = rt->fl.fl4_tos;
2596         r->rtm_table    = RT_TABLE_MAIN;
2597         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2598         r->rtm_type     = rt->rt_type;
2599         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2600         r->rtm_protocol = RTPROT_UNSPEC;
2601         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2602         if (rt->rt_flags & RTCF_NOTIFY)
2603                 r->rtm_flags |= RTM_F_NOTIFY;
2604
2605         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2606
2607         if (rt->fl.fl4_src) {
2608                 r->rtm_src_len = 32;
2609                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2610         }
2611         if (rt->u.dst.dev)
2612                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2613 #ifdef CONFIG_NET_CLS_ROUTE
2614         if (rt->u.dst.tclassid)
2615                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2616 #endif
2617         if (rt->fl.iif)
2618                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2619         else if (rt->rt_src != rt->fl.fl4_src)
2620                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2621
2622         if (rt->rt_dst != rt->rt_gateway)
2623                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2624
2625         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2626                 goto nla_put_failure;
2627
2628         error = rt->u.dst.error;
2629         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2630         if (rt->peer) {
2631                 id = rt->peer->ip_id_count;
2632                 if (rt->peer->tcp_ts_stamp) {
2633                         ts = rt->peer->tcp_ts;
2634                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2635                 }
2636         }
2637
2638         if (rt->fl.iif) {
2639 #ifdef CONFIG_IP_MROUTE
2640                 __be32 dst = rt->rt_dst;
2641
2642                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2643                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2644                         int err = ipmr_get_route(skb, r, nowait);
2645                         if (err <= 0) {
2646                                 if (!nowait) {
2647                                         if (err == 0)
2648                                                 return 0;
2649                                         goto nla_put_failure;
2650                                 } else {
2651                                         if (err == -EMSGSIZE)
2652                                                 goto nla_put_failure;
2653                                         error = err;
2654                                 }
2655                         }
2656                 } else
2657 #endif
2658                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2659         }
2660
2661         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2662                                expires, error) < 0)
2663                 goto nla_put_failure;
2664
2665         return nlmsg_end(skb, nlh);
2666
2667 nla_put_failure:
2668         nlmsg_cancel(skb, nlh);
2669         return -EMSGSIZE;
2670 }
2671
2672 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2673 {
2674         struct net *net = in_skb->sk->sk_net;
2675         struct rtmsg *rtm;
2676         struct nlattr *tb[RTA_MAX+1];
2677         struct rtable *rt = NULL;
2678         __be32 dst = 0;
2679         __be32 src = 0;
2680         u32 iif;
2681         int err;
2682         struct sk_buff *skb;
2683
2684         if (net != &init_net)
2685                 return -EINVAL;
2686
2687         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2688         if (err < 0)
2689                 goto errout;
2690
2691         rtm = nlmsg_data(nlh);
2692
2693         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2694         if (skb == NULL) {
2695                 err = -ENOBUFS;
2696                 goto errout;
2697         }
2698
2699         /* Reserve room for dummy headers; this skb can pass
2700            through a good chunk of the routing engine.
2701          */
2702         skb_reset_mac_header(skb);
2703         skb_reset_network_header(skb);
2704
2705         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2706         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2707         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2708
2709         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2710         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2711         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2712
2713         if (iif) {
2714                 struct net_device *dev;
2715
2716                 dev = __dev_get_by_index(&init_net, iif);
2717                 if (dev == NULL) {
2718                         err = -ENODEV;
2719                         goto errout_free;
2720                 }
2721
2722                 skb->protocol   = htons(ETH_P_IP);
2723                 skb->dev        = dev;
2724                 local_bh_disable();
2725                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2726                 local_bh_enable();
2727
2728                 rt = (struct rtable*) skb->dst;
2729                 if (err == 0 && rt->u.dst.error)
2730                         err = -rt->u.dst.error;
2731         } else {
2732                 struct flowi fl = {
2733                         .nl_u = {
2734                                 .ip4_u = {
2735                                         .daddr = dst,
2736                                         .saddr = src,
2737                                         .tos = rtm->rtm_tos,
2738                                 },
2739                         },
2740                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2741                 };
2742                 err = ip_route_output_key(&init_net, &rt, &fl);
2743         }
2744
2745         if (err)
2746                 goto errout_free;
2747
2748         skb->dst = &rt->u.dst;
2749         if (rtm->rtm_flags & RTM_F_NOTIFY)
2750                 rt->rt_flags |= RTCF_NOTIFY;
2751
2752         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2753                                 RTM_NEWROUTE, 0, 0);
2754         if (err <= 0)
2755                 goto errout_free;
2756
2757         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2758 errout:
2759         return err;
2760
2761 errout_free:
2762         kfree_skb(skb);
2763         goto errout;
2764 }
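/*
 * Editor's note (usage, not part of the original file): inet_rtm_getroute()
 * is the kernel side of RTM_GETROUTE requests, i.e. what iproute2 issues for
 *
 *	ip route get 10.0.0.1
 *	ip route get 10.0.0.1 from 192.168.1.2 iif eth0
 *
 * The reply is a single RTM_NEWROUTE message built by rt_fill_info() above.
 */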
2765
2766 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2767 {
2768         struct rtable *rt;
2769         int h, s_h;
2770         int idx, s_idx;
2771
2772         s_h = cb->args[0];
2773         if (s_h < 0)
2774                 s_h = 0;
2775         s_idx = idx = cb->args[1];
2776         for (h = s_h; h <= rt_hash_mask; h++) {
2777                 rcu_read_lock_bh();
2778                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2779                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2780                         if (idx < s_idx)
2781                                 continue;
2782                         skb->dst = dst_clone(&rt->u.dst);
2783                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2784                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2785                                          1, NLM_F_MULTI) <= 0) {
2786                                 dst_release(xchg(&skb->dst, NULL));
2787                                 rcu_read_unlock_bh();
2788                                 goto done;
2789                         }
2790                         dst_release(xchg(&skb->dst, NULL));
2791                 }
2792                 rcu_read_unlock_bh();
2793                 s_idx = 0;
2794         }
2795
2796 done:
2797         cb->args[0] = h;
2798         cb->args[1] = idx;
2799         return skb->len;
2800 }
2801
2802 void ip_rt_multicast_event(struct in_device *in_dev)
2803 {
2804         rt_cache_flush(0);
2805 }
2806
2807 #ifdef CONFIG_SYSCTL
2808 static int flush_delay;
2809
2810 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2811                                         struct file *filp, void __user *buffer,
2812                                         size_t *lenp, loff_t *ppos)
2813 {
2814         if (write) {
2815                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2816                 rt_cache_flush(flush_delay);
2817                 return 0;
2818         }
2819
2820         return -EINVAL;
2821 }
2822
2823 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2824                                                 int __user *name,
2825                                                 int nlen,
2826                                                 void __user *oldval,
2827                                                 size_t __user *oldlenp,
2828                                                 void __user *newval,
2829                                                 size_t newlen)
2830 {
2831         int delay;
2832         if (newlen != sizeof(int))
2833                 return -EINVAL;
2834         if (get_user(delay, (int __user *)newval))
2835                 return -EFAULT;
2836         rt_cache_flush(delay);
2837         return 0;
2838 }
2839
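/*
 * Editor's note (usage, not part of the original file): the "flush" entry
 * below is write-only (mode 0200); writing a value triggers rt_cache_flush()
 * with that delay, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache right away, while reads are rejected (-EINVAL).
 */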
2840 ctl_table ipv4_route_table[] = {
2841         {
2842                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2843                 .procname       = "flush",
2844                 .data           = &flush_delay,
2845                 .maxlen         = sizeof(int),
2846                 .mode           = 0200,
2847                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2848                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2849         },
2850         {
2851                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2852                 .procname       = "min_delay",
2853                 .data           = &ip_rt_min_delay,
2854                 .maxlen         = sizeof(int),
2855                 .mode           = 0644,
2856                 .proc_handler   = &proc_dointvec_jiffies,
2857                 .strategy       = &sysctl_jiffies,
2858         },
2859         {
2860                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2861                 .procname       = "max_delay",
2862                 .data           = &ip_rt_max_delay,
2863                 .maxlen         = sizeof(int),
2864                 .mode           = 0644,
2865                 .proc_handler   = &proc_dointvec_jiffies,
2866                 .strategy       = &sysctl_jiffies,
2867         },
2868         {
2869                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2870                 .procname       = "gc_thresh",
2871                 .data           = &ipv4_dst_ops.gc_thresh,
2872                 .maxlen         = sizeof(int),
2873                 .mode           = 0644,
2874                 .proc_handler   = &proc_dointvec,
2875         },
2876         {
2877                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2878                 .procname       = "max_size",
2879                 .data           = &ip_rt_max_size,
2880                 .maxlen         = sizeof(int),
2881                 .mode           = 0644,
2882                 .proc_handler   = &proc_dointvec,
2883         },
2884         {
2885                 /*  Deprecated. Use gc_min_interval_ms */
2886
2887                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2888                 .procname       = "gc_min_interval",
2889                 .data           = &ip_rt_gc_min_interval,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = &proc_dointvec_jiffies,
2893                 .strategy       = &sysctl_jiffies,
2894         },
2895         {
2896                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2897                 .procname       = "gc_min_interval_ms",
2898                 .data           = &ip_rt_gc_min_interval,
2899                 .maxlen         = sizeof(int),
2900                 .mode           = 0644,
2901                 .proc_handler   = &proc_dointvec_ms_jiffies,
2902                 .strategy       = &sysctl_ms_jiffies,
2903         },
2904         {
2905                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2906                 .procname       = "gc_timeout",
2907                 .data           = &ip_rt_gc_timeout,
2908                 .maxlen         = sizeof(int),
2909                 .mode           = 0644,
2910                 .proc_handler   = &proc_dointvec_jiffies,
2911                 .strategy       = &sysctl_jiffies,
2912         },
2913         {
2914                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2915                 .procname       = "gc_interval",
2916                 .data           = &ip_rt_gc_interval,
2917                 .maxlen         = sizeof(int),
2918                 .mode           = 0644,
2919                 .proc_handler   = &proc_dointvec_jiffies,
2920                 .strategy       = &sysctl_jiffies,
2921         },
2922         {
2923                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2924                 .procname       = "redirect_load",
2925                 .data           = &ip_rt_redirect_load,
2926                 .maxlen         = sizeof(int),
2927                 .mode           = 0644,
2928                 .proc_handler   = &proc_dointvec,
2929         },
2930         {
2931                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2932                 .procname       = "redirect_number",
2933                 .data           = &ip_rt_redirect_number,
2934                 .maxlen         = sizeof(int),
2935                 .mode           = 0644,
2936                 .proc_handler   = &proc_dointvec,
2937         },
2938         {
2939                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2940                 .procname       = "redirect_silence",
2941                 .data           = &ip_rt_redirect_silence,
2942                 .maxlen         = sizeof(int),
2943                 .mode           = 0644,
2944                 .proc_handler   = &proc_dointvec,
2945         },
2946         {
2947                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2948                 .procname       = "error_cost",
2949                 .data           = &ip_rt_error_cost,
2950                 .maxlen         = sizeof(int),
2951                 .mode           = 0644,
2952                 .proc_handler   = &proc_dointvec,
2953         },
2954         {
2955                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2956                 .procname       = "error_burst",
2957                 .data           = &ip_rt_error_burst,
2958                 .maxlen         = sizeof(int),
2959                 .mode           = 0644,
2960                 .proc_handler   = &proc_dointvec,
2961         },
2962         {
2963                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2964                 .procname       = "gc_elasticity",
2965                 .data           = &ip_rt_gc_elasticity,
2966                 .maxlen         = sizeof(int),
2967                 .mode           = 0644,
2968                 .proc_handler   = &proc_dointvec,
2969         },
2970         {
2971                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2972                 .procname       = "mtu_expires",
2973                 .data           = &ip_rt_mtu_expires,
2974                 .maxlen         = sizeof(int),
2975                 .mode           = 0644,
2976                 .proc_handler   = &proc_dointvec_jiffies,
2977                 .strategy       = &sysctl_jiffies,
2978         },
2979         {
2980                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2981                 .procname       = "min_pmtu",
2982                 .data           = &ip_rt_min_pmtu,
2983                 .maxlen         = sizeof(int),
2984                 .mode           = 0644,
2985                 .proc_handler   = &proc_dointvec,
2986         },
2987         {
2988                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2989                 .procname       = "min_adv_mss",
2990                 .data           = &ip_rt_min_advmss,
2991                 .maxlen         = sizeof(int),
2992                 .mode           = 0644,
2993                 .proc_handler   = &proc_dointvec,
2994         },
2995         {
2996                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2997                 .procname       = "secret_interval",
2998                 .data           = &ip_rt_secret_interval,
2999                 .maxlen         = sizeof(int),
3000                 .mode           = 0644,
3001                 .proc_handler   = &proc_dointvec_jiffies,
3002                 .strategy       = &sysctl_jiffies,
3003         },
3004         { .ctl_name = 0 }
3005 };
3006 #endif
3007
3008 #ifdef CONFIG_NET_CLS_ROUTE
3009 struct ip_rt_acct *ip_rt_acct __read_mostly;
3010 #endif /* CONFIG_NET_CLS_ROUTE */
3011
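/*
 * "rhash_entries=N" on the kernel command line fixes the number of
 * entries in the route cache hash table; left at 0, the size is chosen
 * by alloc_large_system_hash() from the amount of memory in the box.
 */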
3012 static __initdata unsigned long rhash_entries;
3013 static int __init set_rhash_entries(char *str)
3014 {
3015         if (!str)
3016                 return 0;
3017         rhash_entries = simple_strtoul(str, &str, 0);
3018         return 1;
3019 }
3020 __setup("rhash_entries=", set_rhash_entries);
3021
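/*
 * Boot-time initialisation of the IPv4 routing cache: seed the hash
 * secret, create the dst slab cache, allocate and size the hash table,
 * derive the GC thresholds from it, initialise devinet and the FIB,
 * start the flush and secret-rebuild timers, and register the /proc
 * files and the RTM_GETROUTE rtnetlink handler.
 */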
3022 int __init ip_rt_init(void)
3023 {
3024         int rc = 0;
3025
3026         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3027                              (jiffies ^ (jiffies >> 7)));
3028
3029 #ifdef CONFIG_NET_CLS_ROUTE
3030         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3031         if (!ip_rt_acct)
3032                 panic("IP: failed to allocate ip_rt_acct\n");
3033 #endif
3034
3035         ipv4_dst_ops.kmem_cachep =
3036                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3037                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3038
3039         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3040
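        /*
         * Allocate the cache hash table (honouring "rhash_entries=" if it
         * was given); gc_thresh and the hard cap on cached routes are
         * derived from the resulting table size below.
         */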
3041         rt_hash_table = (struct rt_hash_bucket *)
3042                 alloc_large_system_hash("IP route cache",
3043                                         sizeof(struct rt_hash_bucket),
3044                                         rhash_entries,
3045                                         (num_physpages >= 128 * 1024) ?
3046                                         15 : 17,
3047                                         0,
3048                                         &rt_hash_log,
3049                                         &rt_hash_mask,
3050                                         0);
3051         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3052         rt_hash_lock_init();
3053
3054         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3055         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3056
3057         devinet_init();
3058         ip_fib_init();
3059
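        /*
         * rt_flush_timer drives delayed cache flushes via rt_run_flush();
         * rt_secret_timer fires every secret_interval to rebuild the hash
         * secret (rt_secret_rebuild).
         */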
3060         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3061         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3062
3063         /* All the timers, started at system startup, tend
3064            to synchronize. Perturb them a bit.
3065          */
3066         schedule_delayed_work(&expires_work,
3067                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3068
3069         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3070                 ip_rt_secret_interval;
3071         add_timer(&rt_secret_timer);
3072
3073         if (ip_rt_proc_init(&init_net))
3074                 printk(KERN_ERR "Unable to create route proc files\n");
3075 #ifdef CONFIG_XFRM
3076         xfrm_init();
3077         xfrm4_init();
3078 #endif
3079         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3080
3081         return rc;
3082 }
3083
3084 EXPORT_SYMBOL(__ip_select_ident);
3085 EXPORT_SYMBOL(ip_route_input);
3086 EXPORT_SYMBOL(ip_route_output_key);