]> pilppa.org Git - linux-2.6-omap-h63xx.git/blob - net/netfilter/nf_conntrack_core.c
[NETFILTER]: conntrack: add fixed timeout flag in connection tracking
[linux-2.6-omap-h63xx.git] / net / netfilter / nf_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol denendent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support various size of conntrack structures.
23  * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24  *      - restructure nf_conn (introduce nf_conn_help)
25  *      - redesign 'features' how they were originally intended
26  * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27  *      - add support for L3 protocol module load on demand.
28  *
29  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30  */
31
32 #include <linux/config.h>
33 #include <linux/types.h>
34 #include <linux/netfilter.h>
35 #include <linux/module.h>
36 #include <linux/skbuff.h>
37 #include <linux/proc_fs.h>
38 #include <linux/vmalloc.h>
39 #include <linux/stddef.h>
40 #include <linux/slab.h>
41 #include <linux/random.h>
42 #include <linux/jhash.h>
43 #include <linux/err.h>
44 #include <linux/percpu.h>
45 #include <linux/moduleparam.h>
46 #include <linux/notifier.h>
47 #include <linux/kernel.h>
48 #include <linux/netdevice.h>
49 #include <linux/socket.h>
50
51 /* This rwlock protects the main hash table, protocol/helper/expected
52    registrations, conntrack timers*/
53 #define ASSERT_READ_LOCK(x)
54 #define ASSERT_WRITE_LOCK(x)
55
56 #include <net/netfilter/nf_conntrack.h>
57 #include <net/netfilter/nf_conntrack_l3proto.h>
58 #include <net/netfilter/nf_conntrack_protocol.h>
59 #include <net/netfilter/nf_conntrack_helper.h>
60 #include <net/netfilter/nf_conntrack_core.h>
61 #include <linux/netfilter_ipv4/listhelp.h>
62
63 #define NF_CONNTRACK_VERSION    "0.5.0"
64
65 #if 0
66 #define DEBUGP printk
67 #else
68 #define DEBUGP(format, args...)
69 #endif
70
71 DEFINE_RWLOCK(nf_conntrack_lock);
72
73 /* nf_conntrack_standalone needs this */
74 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
75
76 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
77 LIST_HEAD(nf_conntrack_expect_list);
78 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
79 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
80 static LIST_HEAD(helpers);
81 unsigned int nf_conntrack_htable_size = 0;
82 int nf_conntrack_max;
83 struct list_head *nf_conntrack_hash;
84 static kmem_cache_t *nf_conntrack_expect_cachep;
85 struct nf_conn nf_conntrack_untracked;
86 unsigned int nf_ct_log_invalid;
87 static LIST_HEAD(unconfirmed);
88 static int nf_conntrack_vmalloc;
89
90 static unsigned int nf_conntrack_next_id;
91 static unsigned int nf_conntrack_expect_next_id;
92 #ifdef CONFIG_NF_CONNTRACK_EVENTS
93 ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
94 ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
95
96 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
97
98 /* deliver cached events and clear cache entry - must be called with locally
99  * disabled softirqs */
100 static inline void
101 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
102 {
103         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
104         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
105             && ecache->events)
106                 atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
107                                     ecache->ct);
108
109         ecache->events = 0;
110         nf_ct_put(ecache->ct);
111         ecache->ct = NULL;
112 }
113
114 /* Deliver all cached events for a particular conntrack. This is called
115  * by code prior to async packet handling for freeing the skb */
116 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
117 {
118         struct nf_conntrack_ecache *ecache;
119
120         local_bh_disable();
121         ecache = &__get_cpu_var(nf_conntrack_ecache);
122         if (ecache->ct == ct)
123                 __nf_ct_deliver_cached_events(ecache);
124         local_bh_enable();
125 }
126
127 /* Deliver cached events for old pending events, if current conntrack != old */
128 void __nf_ct_event_cache_init(struct nf_conn *ct)
129 {
130         struct nf_conntrack_ecache *ecache;
131         
132         /* take care of delivering potentially old events */
133         ecache = &__get_cpu_var(nf_conntrack_ecache);
134         BUG_ON(ecache->ct == ct);
135         if (ecache->ct)
136                 __nf_ct_deliver_cached_events(ecache);
137         /* initialize for this conntrack/packet */
138         ecache->ct = ct;
139         nf_conntrack_get(&ct->ct_general);
140 }
141
142 /* flush the event cache - touches other CPU's data and must not be called
143  * while packets are still passing through the code */
144 static void nf_ct_event_cache_flush(void)
145 {
146         struct nf_conntrack_ecache *ecache;
147         int cpu;
148
149         for_each_possible_cpu(cpu) {
150                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
151                 if (ecache->ct)
152                         nf_ct_put(ecache->ct);
153         }
154 }
155 #else
156 static inline void nf_ct_event_cache_flush(void) {}
157 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
158
159 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
160 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
161
162 /*
163  * This scheme offers various size of "struct nf_conn" dependent on
164  * features(helper, nat, ...)
165  */
166
167 #define NF_CT_FEATURES_NAMELEN  256
168 static struct {
169         /* name of slab cache. printed in /proc/slabinfo */
170         char *name;
171
172         /* size of slab cache */
173         size_t size;
174
175         /* slab cache pointer */
176         kmem_cache_t *cachep;
177
178         /* allocated slab cache + modules which uses this slab cache */
179         int use;
180
181 } nf_ct_cache[NF_CT_F_NUM];
182
183 /* protect members of nf_ct_cache except of "use" */
184 DEFINE_RWLOCK(nf_ct_cache_lock);
185
186 /* This avoids calling kmem_cache_create() with same name simultaneously */
187 static DEFINE_MUTEX(nf_ct_cache_mutex);
188
189 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
190 struct nf_conntrack_protocol *
191 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
192 {
193         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
194                 return &nf_conntrack_generic_protocol;
195
196         return nf_ct_protos[l3proto][protocol];
197 }
198
199 /* this is guaranteed to always return a valid protocol helper, since
200  * it falls back to generic_protocol */
201 struct nf_conntrack_protocol *
202 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
203 {
204         struct nf_conntrack_protocol *p;
205
206         preempt_disable();
207         p = __nf_ct_proto_find(l3proto, protocol);
208         if (!try_module_get(p->me))
209                 p = &nf_conntrack_generic_protocol;
210         preempt_enable();
211         
212         return p;
213 }
214
215 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
216 {
217         module_put(p->me);
218 }
219
220 struct nf_conntrack_l3proto *
221 nf_ct_l3proto_find_get(u_int16_t l3proto)
222 {
223         struct nf_conntrack_l3proto *p;
224
225         preempt_disable();
226         p = __nf_ct_l3proto_find(l3proto);
227         if (!try_module_get(p->me))
228                 p = &nf_conntrack_generic_l3proto;
229         preempt_enable();
230
231         return p;
232 }
233
234 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
235 {
236         module_put(p->me);
237 }
238
239 int
240 nf_ct_l3proto_try_module_get(unsigned short l3proto)
241 {
242         int ret;
243         struct nf_conntrack_l3proto *p;
244
245 retry:  p = nf_ct_l3proto_find_get(l3proto);
246         if (p == &nf_conntrack_generic_l3proto) {
247                 ret = request_module("nf_conntrack-%d", l3proto);
248                 if (!ret)
249                         goto retry;
250
251                 return -EPROTOTYPE;
252         }
253
254         return 0;
255 }
256
257 void nf_ct_l3proto_module_put(unsigned short l3proto)
258 {
259         struct nf_conntrack_l3proto *p;
260
261         preempt_disable();
262         p = __nf_ct_l3proto_find(l3proto);
263         preempt_enable();
264
265         module_put(p->me);
266 }
267
268 static int nf_conntrack_hash_rnd_initted;
269 static unsigned int nf_conntrack_hash_rnd;
270
271 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
272                                   unsigned int size, unsigned int rnd)
273 {
274         unsigned int a, b;
275         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
276                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
277         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
278                         (tuple->src.u.all << 16) | tuple->dst.u.all);
279
280         return jhash_2words(a, b, rnd) % size;
281 }
282
283 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
284 {
285         return __hash_conntrack(tuple, nf_conntrack_htable_size,
286                                 nf_conntrack_hash_rnd);
287 }
288
289 int nf_conntrack_register_cache(u_int32_t features, const char *name,
290                                 size_t size)
291 {
292         int ret = 0;
293         char *cache_name;
294         kmem_cache_t *cachep;
295
296         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
297                features, name, size);
298
299         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
300                 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
301                         features);
302                 return -EINVAL;
303         }
304
305         mutex_lock(&nf_ct_cache_mutex);
306
307         write_lock_bh(&nf_ct_cache_lock);
308         /* e.g: multiple helpers are loaded */
309         if (nf_ct_cache[features].use > 0) {
310                 DEBUGP("nf_conntrack_register_cache: already resisterd.\n");
311                 if ((!strncmp(nf_ct_cache[features].name, name,
312                               NF_CT_FEATURES_NAMELEN))
313                     && nf_ct_cache[features].size == size) {
314                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
315                         nf_ct_cache[features].use++;
316                         ret = 0;
317                 } else
318                         ret = -EBUSY;
319
320                 write_unlock_bh(&nf_ct_cache_lock);
321                 mutex_unlock(&nf_ct_cache_mutex);
322                 return ret;
323         }
324         write_unlock_bh(&nf_ct_cache_lock);
325
326         /*
327          * The memory space for name of slab cache must be alive until
328          * cache is destroyed.
329          */
330         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
331         if (cache_name == NULL) {
332                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
333                 ret = -ENOMEM;
334                 goto out_up_mutex;
335         }
336
337         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
338                                                 >= NF_CT_FEATURES_NAMELEN) {
339                 printk("nf_conntrack_register_cache: name too long\n");
340                 ret = -EINVAL;
341                 goto out_free_name;
342         }
343
344         cachep = kmem_cache_create(cache_name, size, 0, 0,
345                                    NULL, NULL);
346         if (!cachep) {
347                 printk("nf_conntrack_register_cache: Can't create slab cache "
348                        "for the features = 0x%x\n", features);
349                 ret = -ENOMEM;
350                 goto out_free_name;
351         }
352
353         write_lock_bh(&nf_ct_cache_lock);
354         nf_ct_cache[features].use = 1;
355         nf_ct_cache[features].size = size;
356         nf_ct_cache[features].cachep = cachep;
357         nf_ct_cache[features].name = cache_name;
358         write_unlock_bh(&nf_ct_cache_lock);
359
360         goto out_up_mutex;
361
362 out_free_name:
363         kfree(cache_name);
364 out_up_mutex:
365         mutex_unlock(&nf_ct_cache_mutex);
366         return ret;
367 }
368
369 /* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */
370 void nf_conntrack_unregister_cache(u_int32_t features)
371 {
372         kmem_cache_t *cachep;
373         char *name;
374
375         /*
376          * This assures that kmem_cache_create() isn't called before destroying
377          * slab cache.
378          */
379         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
380         mutex_lock(&nf_ct_cache_mutex);
381
382         write_lock_bh(&nf_ct_cache_lock);
383         if (--nf_ct_cache[features].use > 0) {
384                 write_unlock_bh(&nf_ct_cache_lock);
385                 mutex_unlock(&nf_ct_cache_mutex);
386                 return;
387         }
388         cachep = nf_ct_cache[features].cachep;
389         name = nf_ct_cache[features].name;
390         nf_ct_cache[features].cachep = NULL;
391         nf_ct_cache[features].name = NULL;
392         nf_ct_cache[features].size = 0;
393         write_unlock_bh(&nf_ct_cache_lock);
394
395         synchronize_net();
396
397         kmem_cache_destroy(cachep);
398         kfree(name);
399
400         mutex_unlock(&nf_ct_cache_mutex);
401 }
402
403 int
404 nf_ct_get_tuple(const struct sk_buff *skb,
405                 unsigned int nhoff,
406                 unsigned int dataoff,
407                 u_int16_t l3num,
408                 u_int8_t protonum,
409                 struct nf_conntrack_tuple *tuple,
410                 const struct nf_conntrack_l3proto *l3proto,
411                 const struct nf_conntrack_protocol *protocol)
412 {
413         NF_CT_TUPLE_U_BLANK(tuple);
414
415         tuple->src.l3num = l3num;
416         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
417                 return 0;
418
419         tuple->dst.protonum = protonum;
420         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
421
422         return protocol->pkt_to_tuple(skb, dataoff, tuple);
423 }
424
425 int
426 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
427                    const struct nf_conntrack_tuple *orig,
428                    const struct nf_conntrack_l3proto *l3proto,
429                    const struct nf_conntrack_protocol *protocol)
430 {
431         NF_CT_TUPLE_U_BLANK(inverse);
432
433         inverse->src.l3num = orig->src.l3num;
434         if (l3proto->invert_tuple(inverse, orig) == 0)
435                 return 0;
436
437         inverse->dst.dir = !orig->dst.dir;
438
439         inverse->dst.protonum = orig->dst.protonum;
440         return protocol->invert_tuple(inverse, orig);
441 }
442
443 /* nf_conntrack_expect helper functions */
444 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
445 {
446         struct nf_conn_help *master_help = nfct_help(exp->master);
447
448         NF_CT_ASSERT(master_help);
449         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
450         NF_CT_ASSERT(!timer_pending(&exp->timeout));
451
452         list_del(&exp->list);
453         NF_CT_STAT_INC(expect_delete);
454         master_help->expecting--;
455         nf_conntrack_expect_put(exp);
456 }
457
458 static void expectation_timed_out(unsigned long ul_expect)
459 {
460         struct nf_conntrack_expect *exp = (void *)ul_expect;
461
462         write_lock_bh(&nf_conntrack_lock);
463         nf_ct_unlink_expect(exp);
464         write_unlock_bh(&nf_conntrack_lock);
465         nf_conntrack_expect_put(exp);
466 }
467
468 struct nf_conntrack_expect *
469 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
470 {
471         struct nf_conntrack_expect *i;
472         
473         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
474                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
475                         atomic_inc(&i->use);
476                         return i;
477                 }
478         }
479         return NULL;
480 }
481
482 /* Just find a expectation corresponding to a tuple. */
483 struct nf_conntrack_expect *
484 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
485 {
486         struct nf_conntrack_expect *i;
487         
488         read_lock_bh(&nf_conntrack_lock);
489         i = __nf_conntrack_expect_find(tuple);
490         read_unlock_bh(&nf_conntrack_lock);
491
492         return i;
493 }
494
495 /* If an expectation for this connection is found, it gets delete from
496  * global list then returned. */
497 static struct nf_conntrack_expect *
498 find_expectation(const struct nf_conntrack_tuple *tuple)
499 {
500         struct nf_conntrack_expect *i;
501
502         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
503         /* If master is not in hash table yet (ie. packet hasn't left
504            this machine yet), how can other end know about expected?
505            Hence these are not the droids you are looking for (if
506            master ct never got confirmed, we'd hold a reference to it
507            and weird things would happen to future packets). */
508                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
509                     && nf_ct_is_confirmed(i->master)) {
510                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
511                                 atomic_inc(&i->use);
512                                 return i;
513                         } else if (del_timer(&i->timeout)) {
514                                 nf_ct_unlink_expect(i);
515                                 return i;
516                         }
517                 }
518         }
519         return NULL;
520 }
521
522 /* delete all expectations for this conntrack */
523 void nf_ct_remove_expectations(struct nf_conn *ct)
524 {
525         struct nf_conntrack_expect *i, *tmp;
526         struct nf_conn_help *help = nfct_help(ct);
527
528         /* Optimization: most connection never expect any others. */
529         if (!help || help->expecting == 0)
530                 return;
531
532         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
533                 if (i->master == ct && del_timer(&i->timeout)) {
534                         nf_ct_unlink_expect(i);
535                         nf_conntrack_expect_put(i);
536                 }
537         }
538 }
539
540 static void
541 clean_from_lists(struct nf_conn *ct)
542 {
543         unsigned int ho, hr;
544         
545         DEBUGP("clean_from_lists(%p)\n", ct);
546         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
547
548         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
549         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
550         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
551         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
552
553         /* Destroy all pending expectations */
554         nf_ct_remove_expectations(ct);
555 }
556
557 static void
558 destroy_conntrack(struct nf_conntrack *nfct)
559 {
560         struct nf_conn *ct = (struct nf_conn *)nfct;
561         struct nf_conntrack_l3proto *l3proto;
562         struct nf_conntrack_protocol *proto;
563
564         DEBUGP("destroy_conntrack(%p)\n", ct);
565         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
566         NF_CT_ASSERT(!timer_pending(&ct->timeout));
567
568         nf_conntrack_event(IPCT_DESTROY, ct);
569         set_bit(IPS_DYING_BIT, &ct->status);
570
571         /* To make sure we don't get any weird locking issues here:
572          * destroy_conntrack() MUST NOT be called with a write lock
573          * to nf_conntrack_lock!!! -HW */
574         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
575         if (l3proto && l3proto->destroy)
576                 l3proto->destroy(ct);
577
578         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
579         if (proto && proto->destroy)
580                 proto->destroy(ct);
581
582         if (nf_conntrack_destroyed)
583                 nf_conntrack_destroyed(ct);
584
585         write_lock_bh(&nf_conntrack_lock);
586         /* Expectations will have been removed in clean_from_lists,
587          * except TFTP can create an expectation on the first packet,
588          * before connection is in the list, so we need to clean here,
589          * too. */
590         nf_ct_remove_expectations(ct);
591
592         /* We overload first tuple to link into unconfirmed list. */
593         if (!nf_ct_is_confirmed(ct)) {
594                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
595                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
596         }
597
598         NF_CT_STAT_INC(delete);
599         write_unlock_bh(&nf_conntrack_lock);
600
601         if (ct->master)
602                 nf_ct_put(ct->master);
603
604         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
605         nf_conntrack_free(ct);
606 }
607
608 static void death_by_timeout(unsigned long ul_conntrack)
609 {
610         struct nf_conn *ct = (void *)ul_conntrack;
611
612         write_lock_bh(&nf_conntrack_lock);
613         /* Inside lock so preempt is disabled on module removal path.
614          * Otherwise we can get spurious warnings. */
615         NF_CT_STAT_INC(delete_list);
616         clean_from_lists(ct);
617         write_unlock_bh(&nf_conntrack_lock);
618         nf_ct_put(ct);
619 }
620
621 static inline int
622 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
623                     const struct nf_conntrack_tuple *tuple,
624                     const struct nf_conn *ignored_conntrack)
625 {
626         ASSERT_READ_LOCK(&nf_conntrack_lock);
627         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
628                 && nf_ct_tuple_equal(tuple, &i->tuple);
629 }
630
631 struct nf_conntrack_tuple_hash *
632 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
633                     const struct nf_conn *ignored_conntrack)
634 {
635         struct nf_conntrack_tuple_hash *h;
636         unsigned int hash = hash_conntrack(tuple);
637
638         ASSERT_READ_LOCK(&nf_conntrack_lock);
639         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
640                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
641                         NF_CT_STAT_INC(found);
642                         return h;
643                 }
644                 NF_CT_STAT_INC(searched);
645         }
646
647         return NULL;
648 }
649
650 /* Find a connection corresponding to a tuple. */
651 struct nf_conntrack_tuple_hash *
652 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
653                       const struct nf_conn *ignored_conntrack)
654 {
655         struct nf_conntrack_tuple_hash *h;
656
657         read_lock_bh(&nf_conntrack_lock);
658         h = __nf_conntrack_find(tuple, ignored_conntrack);
659         if (h)
660                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
661         read_unlock_bh(&nf_conntrack_lock);
662
663         return h;
664 }
665
666 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
667                                        unsigned int hash,
668                                        unsigned int repl_hash) 
669 {
670         ct->id = ++nf_conntrack_next_id;
671         list_prepend(&nf_conntrack_hash[hash],
672                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
673         list_prepend(&nf_conntrack_hash[repl_hash],
674                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
675 }
676
677 void nf_conntrack_hash_insert(struct nf_conn *ct)
678 {
679         unsigned int hash, repl_hash;
680
681         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
682         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
683
684         write_lock_bh(&nf_conntrack_lock);
685         __nf_conntrack_hash_insert(ct, hash, repl_hash);
686         write_unlock_bh(&nf_conntrack_lock);
687 }
688
689 /* Confirm a connection given skb; places it in hash table */
690 int
691 __nf_conntrack_confirm(struct sk_buff **pskb)
692 {
693         unsigned int hash, repl_hash;
694         struct nf_conn *ct;
695         enum ip_conntrack_info ctinfo;
696
697         ct = nf_ct_get(*pskb, &ctinfo);
698
699         /* ipt_REJECT uses nf_conntrack_attach to attach related
700            ICMP/TCP RST packets in other direction.  Actual packet
701            which created connection will be IP_CT_NEW or for an
702            expected connection, IP_CT_RELATED. */
703         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
704                 return NF_ACCEPT;
705
706         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
707         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
708
709         /* We're not in hash table, and we refuse to set up related
710            connections for unconfirmed conns.  But packet copies and
711            REJECT will give spurious warnings here. */
712         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
713
714         /* No external references means noone else could have
715            confirmed us. */
716         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
717         DEBUGP("Confirming conntrack %p\n", ct);
718
719         write_lock_bh(&nf_conntrack_lock);
720
721         /* See if there's one in the list already, including reverse:
722            NAT could have grabbed it without realizing, since we're
723            not in the hash.  If there is, we lost race. */
724         if (!LIST_FIND(&nf_conntrack_hash[hash],
725                        conntrack_tuple_cmp,
726                        struct nf_conntrack_tuple_hash *,
727                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
728             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
729                           conntrack_tuple_cmp,
730                           struct nf_conntrack_tuple_hash *,
731                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
732                 struct nf_conn_help *help;
733                 /* Remove from unconfirmed list */
734                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
735
736                 __nf_conntrack_hash_insert(ct, hash, repl_hash);
737                 /* Timer relative to confirmation time, not original
738                    setting time, otherwise we'd get timer wrap in
739                    weird delay cases. */
740                 ct->timeout.expires += jiffies;
741                 add_timer(&ct->timeout);
742                 atomic_inc(&ct->ct_general.use);
743                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
744                 NF_CT_STAT_INC(insert);
745                 write_unlock_bh(&nf_conntrack_lock);
746                 help = nfct_help(ct);
747                 if (help && help->helper)
748                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
749 #ifdef CONFIG_NF_NAT_NEEDED
750                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
751                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
752                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
753 #endif
754                 nf_conntrack_event_cache(master_ct(ct) ?
755                                          IPCT_RELATED : IPCT_NEW, *pskb);
756                 return NF_ACCEPT;
757         }
758
759         NF_CT_STAT_INC(insert_failed);
760         write_unlock_bh(&nf_conntrack_lock);
761         return NF_DROP;
762 }
763
764 /* Returns true if a connection correspondings to the tuple (required
765    for NAT). */
766 int
767 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
768                          const struct nf_conn *ignored_conntrack)
769 {
770         struct nf_conntrack_tuple_hash *h;
771
772         read_lock_bh(&nf_conntrack_lock);
773         h = __nf_conntrack_find(tuple, ignored_conntrack);
774         read_unlock_bh(&nf_conntrack_lock);
775
776         return h != NULL;
777 }
778
779 /* There's a small race here where we may free a just-assured
780    connection.  Too bad: we're in trouble anyway. */
781 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
782 {
783         return !(test_bit(IPS_ASSURED_BIT,
784                           &nf_ct_tuplehash_to_ctrack(i)->status));
785 }
786
787 static int early_drop(struct list_head *chain)
788 {
789         /* Traverse backwards: gives us oldest, which is roughly LRU */
790         struct nf_conntrack_tuple_hash *h;
791         struct nf_conn *ct = NULL;
792         int dropped = 0;
793
794         read_lock_bh(&nf_conntrack_lock);
795         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
796         if (h) {
797                 ct = nf_ct_tuplehash_to_ctrack(h);
798                 atomic_inc(&ct->ct_general.use);
799         }
800         read_unlock_bh(&nf_conntrack_lock);
801
802         if (!ct)
803                 return dropped;
804
805         if (del_timer(&ct->timeout)) {
806                 death_by_timeout((unsigned long)ct);
807                 dropped = 1;
808                 NF_CT_STAT_INC(early_drop);
809         }
810         nf_ct_put(ct);
811         return dropped;
812 }
813
814 static inline int helper_cmp(const struct nf_conntrack_helper *i,
815                              const struct nf_conntrack_tuple *rtuple)
816 {
817         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
818 }
819
820 static struct nf_conntrack_helper *
821 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
822 {
823         return LIST_FIND(&helpers, helper_cmp,
824                          struct nf_conntrack_helper *,
825                          tuple);
826 }
827
828 struct nf_conntrack_helper *
829 nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple)
830 {
831         struct nf_conntrack_helper *helper;
832
833         /* need nf_conntrack_lock to assure that helper exists until
834          * try_module_get() is called */
835         read_lock_bh(&nf_conntrack_lock);
836
837         helper = __nf_ct_helper_find(tuple);
838         if (helper) {
839                 /* need to increase module usage count to assure helper will
840                  * not go away while the caller is e.g. busy putting a
841                  * conntrack in the hash that uses the helper */
842                 if (!try_module_get(helper->me))
843                         helper = NULL;
844         }
845
846         read_unlock_bh(&nf_conntrack_lock);
847
848         return helper;
849 }
850
851 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
852 {
853         module_put(helper->me);
854 }
855
856 static struct nf_conn *
857 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
858                      const struct nf_conntrack_tuple *repl,
859                      const struct nf_conntrack_l3proto *l3proto)
860 {
861         struct nf_conn *conntrack = NULL;
862         u_int32_t features = 0;
863         struct nf_conntrack_helper *helper;
864
865         if (unlikely(!nf_conntrack_hash_rnd_initted)) {
866                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
867                 nf_conntrack_hash_rnd_initted = 1;
868         }
869
870         if (nf_conntrack_max
871             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
872                 unsigned int hash = hash_conntrack(orig);
873                 /* Try dropping from this hash chain. */
874                 if (!early_drop(&nf_conntrack_hash[hash])) {
875                         if (net_ratelimit())
876                                 printk(KERN_WARNING
877                                        "nf_conntrack: table full, dropping"
878                                        " packet.\n");
879                         return ERR_PTR(-ENOMEM);
880                 }
881         }
882
883         /*  find features needed by this conntrack. */
884         features = l3proto->get_features(orig);
885
886         /* FIXME: protect helper list per RCU */
887         read_lock_bh(&nf_conntrack_lock);
888         helper = __nf_ct_helper_find(repl);
889         if (helper)
890                 features |= NF_CT_F_HELP;
891         read_unlock_bh(&nf_conntrack_lock);
892
893         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
894
895         read_lock_bh(&nf_ct_cache_lock);
896
897         if (unlikely(!nf_ct_cache[features].use)) {
898                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
899                         features);
900                 goto out;
901         }
902
903         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
904         if (conntrack == NULL) {
905                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
906                 goto out;
907         }
908
909         memset(conntrack, 0, nf_ct_cache[features].size);
910         conntrack->features = features;
911         if (helper) {
912                 struct nf_conn_help *help = nfct_help(conntrack);
913                 NF_CT_ASSERT(help);
914                 help->helper = helper;
915         }
916
917         atomic_set(&conntrack->ct_general.use, 1);
918         conntrack->ct_general.destroy = destroy_conntrack;
919         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
920         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
921         /* Don't set timer yet: wait for confirmation */
922         init_timer(&conntrack->timeout);
923         conntrack->timeout.data = (unsigned long)conntrack;
924         conntrack->timeout.function = death_by_timeout;
925
926         atomic_inc(&nf_conntrack_count);
927 out:
928         read_unlock_bh(&nf_ct_cache_lock);
929         return conntrack;
930 }
931
932 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
933                                    const struct nf_conntrack_tuple *repl)
934 {
935         struct nf_conntrack_l3proto *l3proto;
936
937         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
938         return __nf_conntrack_alloc(orig, repl, l3proto);
939 }
940
941 void nf_conntrack_free(struct nf_conn *conntrack)
942 {
943         u_int32_t features = conntrack->features;
944         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
945         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
946                conntrack);
947         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
948         atomic_dec(&nf_conntrack_count);
949 }
950
951 /* Allocate a new conntrack: we return -ENOMEM if classification
952    failed due to stress.  Otherwise it really is unclassifiable. */
953 static struct nf_conntrack_tuple_hash *
954 init_conntrack(const struct nf_conntrack_tuple *tuple,
955                struct nf_conntrack_l3proto *l3proto,
956                struct nf_conntrack_protocol *protocol,
957                struct sk_buff *skb,
958                unsigned int dataoff)
959 {
960         struct nf_conn *conntrack;
961         struct nf_conntrack_tuple repl_tuple;
962         struct nf_conntrack_expect *exp;
963
964         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
965                 DEBUGP("Can't invert tuple.\n");
966                 return NULL;
967         }
968
969         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
970         if (conntrack == NULL || IS_ERR(conntrack)) {
971                 DEBUGP("Can't allocate conntrack.\n");
972                 return (struct nf_conntrack_tuple_hash *)conntrack;
973         }
974
975         if (!protocol->new(conntrack, skb, dataoff)) {
976                 nf_conntrack_free(conntrack);
977                 DEBUGP("init conntrack: can't track with proto module\n");
978                 return NULL;
979         }
980
981         write_lock_bh(&nf_conntrack_lock);
982         exp = find_expectation(tuple);
983
984         if (exp) {
985                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
986                         conntrack, exp);
987                 /* Welcome, Mr. Bond.  We've been expecting you... */
988                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
989                 conntrack->master = exp->master;
990 #ifdef CONFIG_NF_CONNTRACK_MARK
991                 conntrack->mark = exp->master->mark;
992 #endif
993                 nf_conntrack_get(&conntrack->master->ct_general);
994                 NF_CT_STAT_INC(expect_new);
995         } else
996                 NF_CT_STAT_INC(new);
997
998         /* Overload tuple linked list to put us in unconfirmed list. */
999         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1000
1001         write_unlock_bh(&nf_conntrack_lock);
1002
1003         if (exp) {
1004                 if (exp->expectfn)
1005                         exp->expectfn(conntrack, exp);
1006                 nf_conntrack_expect_put(exp);
1007         }
1008
1009         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1010 }
1011
1012 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1013 static inline struct nf_conn *
1014 resolve_normal_ct(struct sk_buff *skb,
1015                   unsigned int dataoff,
1016                   u_int16_t l3num,
1017                   u_int8_t protonum,
1018                   struct nf_conntrack_l3proto *l3proto,
1019                   struct nf_conntrack_protocol *proto,
1020                   int *set_reply,
1021                   enum ip_conntrack_info *ctinfo)
1022 {
1023         struct nf_conntrack_tuple tuple;
1024         struct nf_conntrack_tuple_hash *h;
1025         struct nf_conn *ct;
1026
1027         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1028                              dataoff, l3num, protonum, &tuple, l3proto,
1029                              proto)) {
1030                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1031                 return NULL;
1032         }
1033
1034         /* look for tuple match */
1035         h = nf_conntrack_find_get(&tuple, NULL);
1036         if (!h) {
1037                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1038                 if (!h)
1039                         return NULL;
1040                 if (IS_ERR(h))
1041                         return (void *)h;
1042         }
1043         ct = nf_ct_tuplehash_to_ctrack(h);
1044
1045         /* It exists; we have (non-exclusive) reference. */
1046         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1047                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1048                 /* Please set reply bit if this packet OK */
1049                 *set_reply = 1;
1050         } else {
1051                 /* Once we've had two way comms, always ESTABLISHED. */
1052                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1053                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1054                         *ctinfo = IP_CT_ESTABLISHED;
1055                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1056                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1057                         *ctinfo = IP_CT_RELATED;
1058                 } else {
1059                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1060                         *ctinfo = IP_CT_NEW;
1061                 }
1062                 *set_reply = 0;
1063         }
1064         skb->nfct = &ct->ct_general;
1065         skb->nfctinfo = *ctinfo;
1066         return ct;
1067 }
1068
1069 unsigned int
1070 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1071 {
1072         struct nf_conn *ct;
1073         enum ip_conntrack_info ctinfo;
1074         struct nf_conntrack_l3proto *l3proto;
1075         struct nf_conntrack_protocol *proto;
1076         unsigned int dataoff;
1077         u_int8_t protonum;
1078         int set_reply = 0;
1079         int ret;
1080
1081         /* Previously seen (loopback or untracked)?  Ignore. */
1082         if ((*pskb)->nfct) {
1083                 NF_CT_STAT_INC(ignore);
1084                 return NF_ACCEPT;
1085         }
1086
1087         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1088         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1089                 DEBUGP("not prepared to track yet or error occured\n");
1090                 return -ret;
1091         }
1092
1093         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1094
1095         /* It may be an special packet, error, unclean...
1096          * inverse of the return code tells to the netfilter
1097          * core what to do with the packet. */
1098         if (proto->error != NULL &&
1099             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1100                 NF_CT_STAT_INC(error);
1101                 NF_CT_STAT_INC(invalid);
1102                 return -ret;
1103         }
1104
1105         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1106                                &set_reply, &ctinfo);
1107         if (!ct) {
1108                 /* Not valid part of a connection */
1109                 NF_CT_STAT_INC(invalid);
1110                 return NF_ACCEPT;
1111         }
1112
1113         if (IS_ERR(ct)) {
1114                 /* Too stressed to deal. */
1115                 NF_CT_STAT_INC(drop);
1116                 return NF_DROP;
1117         }
1118
1119         NF_CT_ASSERT((*pskb)->nfct);
1120
1121         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1122         if (ret < 0) {
1123                 /* Invalid: inverse of the return code tells
1124                  * the netfilter core what to do */
1125                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1126                 nf_conntrack_put((*pskb)->nfct);
1127                 (*pskb)->nfct = NULL;
1128                 NF_CT_STAT_INC(invalid);
1129                 return -ret;
1130         }
1131
1132         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1133                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1134
1135         return ret;
1136 }
1137
1138 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1139                          const struct nf_conntrack_tuple *orig)
1140 {
1141         return nf_ct_invert_tuple(inverse, orig,
1142                                   __nf_ct_l3proto_find(orig->src.l3num),
1143                                   __nf_ct_proto_find(orig->src.l3num,
1144                                                      orig->dst.protonum));
1145 }
1146
1147 /* Would two expected things clash? */
1148 static inline int expect_clash(const struct nf_conntrack_expect *a,
1149                                const struct nf_conntrack_expect *b)
1150 {
1151         /* Part covered by intersection of masks must be unequal,
1152            otherwise they clash */
1153         struct nf_conntrack_tuple intersect_mask;
1154         int count;
1155
1156         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1157         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1158         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1159         intersect_mask.dst.protonum = a->mask.dst.protonum
1160                                         & b->mask.dst.protonum;
1161
1162         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1163                 intersect_mask.src.u3.all[count] =
1164                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1165         }
1166
1167         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1168                 intersect_mask.dst.u3.all[count] =
1169                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1170         }
1171
1172         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1173 }
1174
1175 static inline int expect_matches(const struct nf_conntrack_expect *a,
1176                                  const struct nf_conntrack_expect *b)
1177 {
1178         return a->master == b->master
1179                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1180                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1181 }
1182
1183 /* Generally a bad idea to call this: could have matched already. */
1184 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1185 {
1186         struct nf_conntrack_expect *i;
1187
1188         write_lock_bh(&nf_conntrack_lock);
1189         /* choose the the oldest expectation to evict */
1190         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1191                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1192                         nf_ct_unlink_expect(i);
1193                         write_unlock_bh(&nf_conntrack_lock);
1194                         nf_conntrack_expect_put(i);
1195                         return;
1196                 }
1197         }
1198         write_unlock_bh(&nf_conntrack_lock);
1199 }
1200
1201 /* We don't increase the master conntrack refcount for non-fulfilled
1202  * conntracks. During the conntrack destruction, the expectations are
1203  * always killed before the conntrack itself */
1204 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1205 {
1206         struct nf_conntrack_expect *new;
1207
1208         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1209         if (!new) {
1210                 DEBUGP("expect_related: OOM allocating expect\n");
1211                 return NULL;
1212         }
1213         new->master = me;
1214         atomic_set(&new->use, 1);
1215         return new;
1216 }
1217
1218 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1219 {
1220         if (atomic_dec_and_test(&exp->use))
1221                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1222 }
1223
1224 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1225 {
1226         struct nf_conn_help *master_help = nfct_help(exp->master);
1227
1228         atomic_inc(&exp->use);
1229         master_help->expecting++;
1230         list_add(&exp->list, &nf_conntrack_expect_list);
1231
1232         init_timer(&exp->timeout);
1233         exp->timeout.data = (unsigned long)exp;
1234         exp->timeout.function = expectation_timed_out;
1235         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1236         add_timer(&exp->timeout);
1237
1238         exp->id = ++nf_conntrack_expect_next_id;
1239         atomic_inc(&exp->use);
1240         NF_CT_STAT_INC(expect_create);
1241 }
1242
1243 /* Race with expectations being used means we could have none to find; OK. */
1244 static void evict_oldest_expect(struct nf_conn *master)
1245 {
1246         struct nf_conntrack_expect *i;
1247
1248         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1249                 if (i->master == master) {
1250                         if (del_timer(&i->timeout)) {
1251                                 nf_ct_unlink_expect(i);
1252                                 nf_conntrack_expect_put(i);
1253                         }
1254                         break;
1255                 }
1256         }
1257 }
1258
1259 static inline int refresh_timer(struct nf_conntrack_expect *i)
1260 {
1261         struct nf_conn_help *master_help = nfct_help(i->master);
1262
1263         if (!del_timer(&i->timeout))
1264                 return 0;
1265
1266         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1267         add_timer(&i->timeout);
1268         return 1;
1269 }
1270
1271 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1272 {
1273         struct nf_conntrack_expect *i;
1274         struct nf_conn *master = expect->master;
1275         struct nf_conn_help *master_help = nfct_help(master);
1276         int ret;
1277
1278         NF_CT_ASSERT(master_help);
1279
1280         DEBUGP("nf_conntrack_expect_related %p\n", related_to);
1281         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1282         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1283
1284         write_lock_bh(&nf_conntrack_lock);
1285         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1286                 if (expect_matches(i, expect)) {
1287                         /* Refresh timer: if it's dying, ignore.. */
1288                         if (refresh_timer(i)) {
1289                                 ret = 0;
1290                                 goto out;
1291                         }
1292                 } else if (expect_clash(i, expect)) {
1293                         ret = -EBUSY;
1294                         goto out;
1295                 }
1296         }
1297         /* Will be over limit? */
1298         if (master_help->helper->max_expected &&
1299             master_help->expecting >= master_help->helper->max_expected)
1300                 evict_oldest_expect(master);
1301
1302         nf_conntrack_expect_insert(expect);
1303         nf_conntrack_expect_event(IPEXP_NEW, expect);
1304         ret = 0;
1305 out:
1306         write_unlock_bh(&nf_conntrack_lock);
1307         return ret;
1308 }
1309
1310 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1311 {
1312         int ret;
1313         BUG_ON(me->timeout == 0);
1314
1315         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1316                                           sizeof(struct nf_conn)
1317                                           + sizeof(struct nf_conn_help)
1318                                           + __alignof__(struct nf_conn_help));
1319         if (ret < 0) {
1320                 printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n");
1321                 return ret;
1322         }
1323         write_lock_bh(&nf_conntrack_lock);
1324         list_prepend(&helpers, me);
1325         write_unlock_bh(&nf_conntrack_lock);
1326
1327         return 0;
1328 }
1329
1330 struct nf_conntrack_helper *
1331 __nf_conntrack_helper_find_byname(const char *name)
1332 {
1333         struct nf_conntrack_helper *h;
1334
1335         list_for_each_entry(h, &helpers, list) {
1336                 if (!strcmp(h->name, name))
1337                         return h;
1338         }
1339
1340         return NULL;
1341 }
1342
1343 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1344                          const struct nf_conntrack_helper *me)
1345 {
1346         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1347         struct nf_conn_help *help = nfct_help(ct);
1348
1349         if (help && help->helper == me) {
1350                 nf_conntrack_event(IPCT_HELPER, ct);
1351                 help->helper = NULL;
1352         }
1353         return 0;
1354 }
1355
1356 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1357 {
1358         unsigned int i;
1359         struct nf_conntrack_expect *exp, *tmp;
1360
1361         /* Need write lock here, to delete helper. */
1362         write_lock_bh(&nf_conntrack_lock);
1363         LIST_DELETE(&helpers, me);
1364
1365         /* Get rid of expectations */
1366         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1367                 struct nf_conn_help *help = nfct_help(exp->master);
1368                 if (help->helper == me && del_timer(&exp->timeout)) {
1369                         nf_ct_unlink_expect(exp);
1370                         nf_conntrack_expect_put(exp);
1371                 }
1372         }
1373
1374         /* Get rid of expecteds, set helpers to NULL. */
1375         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1376         for (i = 0; i < nf_conntrack_htable_size; i++)
1377                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1378                             struct nf_conntrack_tuple_hash *, me);
1379         write_unlock_bh(&nf_conntrack_lock);
1380
1381         /* Someone could be still looking at the helper in a bh. */
1382         synchronize_net();
1383 }
1384
1385 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1386 void __nf_ct_refresh_acct(struct nf_conn *ct,
1387                           enum ip_conntrack_info ctinfo,
1388                           const struct sk_buff *skb,
1389                           unsigned long extra_jiffies,
1390                           int do_acct)
1391 {
1392         int event = 0;
1393
1394         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1395         NF_CT_ASSERT(skb);
1396
1397         write_lock_bh(&nf_conntrack_lock);
1398
1399         /* Only update if this is not a fixed timeout */
1400         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1401                 write_unlock_bh(&nf_conntrack_lock);
1402                 return;
1403         }
1404
1405         /* If not in hash table, timer will not be active yet */
1406         if (!nf_ct_is_confirmed(ct)) {
1407                 ct->timeout.expires = extra_jiffies;
1408                 event = IPCT_REFRESH;
1409         } else {
1410                 /* Need del_timer for race avoidance (may already be dying). */
1411                 if (del_timer(&ct->timeout)) {
1412                         ct->timeout.expires = jiffies + extra_jiffies;
1413                         add_timer(&ct->timeout);
1414                         event = IPCT_REFRESH;
1415                 }
1416         }
1417
1418 #ifdef CONFIG_NF_CT_ACCT
1419         if (do_acct) {
1420                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1421                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1422                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1423         if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1424             || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1425                 event |= IPCT_COUNTER_FILLING;
1426         }
1427 #endif
1428
1429         write_unlock_bh(&nf_conntrack_lock);
1430
1431         /* must be unlocked when calling event cache */
1432         if (event)
1433                 nf_conntrack_event_cache(event, skb);
1434 }
1435
1436 #if defined(CONFIG_NF_CT_NETLINK) || \
1437     defined(CONFIG_NF_CT_NETLINK_MODULE)
1438
1439 #include <linux/netfilter/nfnetlink.h>
1440 #include <linux/netfilter/nfnetlink_conntrack.h>
1441 #include <linux/mutex.h>
1442
1443
1444 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1445  * in ip_conntrack_core, since we don't want the protocols to autoload
1446  * or depend on ctnetlink */
1447 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1448                                const struct nf_conntrack_tuple *tuple)
1449 {
1450         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1451                 &tuple->src.u.tcp.port);
1452         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1453                 &tuple->dst.u.tcp.port);
1454         return 0;
1455
1456 nfattr_failure:
1457         return -1;
1458 }
1459
1460 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1461         [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1462         [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1463 };
1464
1465 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1466                                struct nf_conntrack_tuple *t)
1467 {
1468         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1469                 return -EINVAL;
1470
1471         if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1472                 return -EINVAL;
1473
1474         t->src.u.tcp.port =
1475                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1476         t->dst.u.tcp.port =
1477                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1478
1479         return 0;
1480 }
1481 #endif
1482
1483 /* Used by ipt_REJECT and ip6t_REJECT. */
1484 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1485 {
1486         struct nf_conn *ct;
1487         enum ip_conntrack_info ctinfo;
1488
1489         /* This ICMP is in reverse direction to the packet which caused it */
1490         ct = nf_ct_get(skb, &ctinfo);
1491         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1492                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1493         else
1494                 ctinfo = IP_CT_RELATED;
1495
1496         /* Attach to new skbuff, and increment count */
1497         nskb->nfct = &ct->ct_general;
1498         nskb->nfctinfo = ctinfo;
1499         nf_conntrack_get(nskb->nfct);
1500 }
1501
1502 static inline int
1503 do_iter(const struct nf_conntrack_tuple_hash *i,
1504         int (*iter)(struct nf_conn *i, void *data),
1505         void *data)
1506 {
1507         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1508 }
1509
1510 /* Bring out ya dead! */
1511 static struct nf_conntrack_tuple_hash *
1512 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1513                 void *data, unsigned int *bucket)
1514 {
1515         struct nf_conntrack_tuple_hash *h = NULL;
1516
1517         write_lock_bh(&nf_conntrack_lock);
1518         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1519                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1520                                 struct nf_conntrack_tuple_hash *, iter, data);
1521                 if (h)
1522                         break;
1523         }
1524         if (!h)
1525                 h = LIST_FIND_W(&unconfirmed, do_iter,
1526                                 struct nf_conntrack_tuple_hash *, iter, data);
1527         if (h)
1528                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1529         write_unlock_bh(&nf_conntrack_lock);
1530
1531         return h;
1532 }
1533
1534 void
1535 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1536 {
1537         struct nf_conntrack_tuple_hash *h;
1538         unsigned int bucket = 0;
1539
1540         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1541                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1542                 /* Time to push up daises... */
1543                 if (del_timer(&ct->timeout))
1544                         death_by_timeout((unsigned long)ct);
1545                 /* ... else the timer will get him soon. */
1546
1547                 nf_ct_put(ct);
1548         }
1549 }
1550
1551 static int kill_all(struct nf_conn *i, void *data)
1552 {
1553         return 1;
1554 }
1555
1556 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1557 {
1558         if (vmalloced)
1559                 vfree(hash);
1560         else
1561                 free_pages((unsigned long)hash, 
1562                            get_order(sizeof(struct list_head) * size));
1563 }
1564
1565 void nf_conntrack_flush()
1566 {
1567         nf_ct_iterate_cleanup(kill_all, NULL);
1568 }
1569
1570 /* Mishearing the voices in his head, our hero wonders how he's
1571    supposed to kill the mall. */
1572 void nf_conntrack_cleanup(void)
1573 {
1574         int i;
1575
1576         ip_ct_attach = NULL;
1577
1578         /* This makes sure all current packets have passed through
1579            netfilter framework.  Roll on, two-stage module
1580            delete... */
1581         synchronize_net();
1582
1583         nf_ct_event_cache_flush();
1584  i_see_dead_people:
1585         nf_conntrack_flush();
1586         if (atomic_read(&nf_conntrack_count) != 0) {
1587                 schedule();
1588                 goto i_see_dead_people;
1589         }
1590         /* wait until all references to nf_conntrack_untracked are dropped */
1591         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1592                 schedule();
1593
1594         for (i = 0; i < NF_CT_F_NUM; i++) {
1595                 if (nf_ct_cache[i].use == 0)
1596                         continue;
1597
1598                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1599                 nf_ct_cache[i].use = 1;
1600                 nf_conntrack_unregister_cache(i);
1601         }
1602         kmem_cache_destroy(nf_conntrack_expect_cachep);
1603         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1604                             nf_conntrack_htable_size);
1605
1606         /* free l3proto protocol tables */
1607         for (i = 0; i < PF_MAX; i++)
1608                 if (nf_ct_protos[i]) {
1609                         kfree(nf_ct_protos[i]);
1610                         nf_ct_protos[i] = NULL;
1611                 }
1612 }
1613
1614 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1615 {
1616         struct list_head *hash;
1617         unsigned int i;
1618
1619         *vmalloced = 0; 
1620         hash = (void*)__get_free_pages(GFP_KERNEL, 
1621                                        get_order(sizeof(struct list_head)
1622                                                  * size));
1623         if (!hash) { 
1624                 *vmalloced = 1;
1625                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1626                 hash = vmalloc(sizeof(struct list_head) * size);
1627         }
1628
1629         if (hash)
1630                 for (i = 0; i < size; i++) 
1631                         INIT_LIST_HEAD(&hash[i]);
1632
1633         return hash;
1634 }
1635
1636 int set_hashsize(const char *val, struct kernel_param *kp)
1637 {
1638         int i, bucket, hashsize, vmalloced;
1639         int old_vmalloced, old_size;
1640         int rnd;
1641         struct list_head *hash, *old_hash;
1642         struct nf_conntrack_tuple_hash *h;
1643
1644         /* On boot, we can set this without any fancy locking. */
1645         if (!nf_conntrack_htable_size)
1646                 return param_set_uint(val, kp);
1647
1648         hashsize = simple_strtol(val, NULL, 0);
1649         if (!hashsize)
1650                 return -EINVAL;
1651
1652         hash = alloc_hashtable(hashsize, &vmalloced);
1653         if (!hash)
1654                 return -ENOMEM;
1655
1656         /* We have to rehahs for the new table anyway, so we also can
1657          * use a newrandom seed */
1658         get_random_bytes(&rnd, 4);
1659
1660         write_lock_bh(&nf_conntrack_lock);
1661         for (i = 0; i < nf_conntrack_htable_size; i++) {
1662                 while (!list_empty(&nf_conntrack_hash[i])) {
1663                         h = list_entry(nf_conntrack_hash[i].next,
1664                                        struct nf_conntrack_tuple_hash, list);
1665                         list_del(&h->list);
1666                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1667                         list_add_tail(&h->list, &hash[bucket]);
1668                 }
1669         }
1670         old_size = nf_conntrack_htable_size;
1671         old_vmalloced = nf_conntrack_vmalloc;
1672         old_hash = nf_conntrack_hash;
1673
1674         nf_conntrack_htable_size = hashsize;
1675         nf_conntrack_vmalloc = vmalloced;
1676         nf_conntrack_hash = hash;
1677         nf_conntrack_hash_rnd = rnd;
1678         write_unlock_bh(&nf_conntrack_lock);
1679
1680         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1681         return 0;
1682 }
1683
1684 module_param_call(hashsize, set_hashsize, param_get_uint,
1685                   &nf_conntrack_htable_size, 0600);
1686
1687 int __init nf_conntrack_init(void)
1688 {
1689         unsigned int i;
1690         int ret;
1691
1692         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1693          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1694         if (!nf_conntrack_htable_size) {
1695                 nf_conntrack_htable_size
1696                         = (((num_physpages << PAGE_SHIFT) / 16384)
1697                            / sizeof(struct list_head));
1698                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1699                         nf_conntrack_htable_size = 8192;
1700                 if (nf_conntrack_htable_size < 16)
1701                         nf_conntrack_htable_size = 16;
1702         }
1703         nf_conntrack_max = 8 * nf_conntrack_htable_size;
1704
1705         printk("nf_conntrack version %s (%u buckets, %d max)\n",
1706                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1707                nf_conntrack_max);
1708
1709         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1710                                             &nf_conntrack_vmalloc);
1711         if (!nf_conntrack_hash) {
1712                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1713                 goto err_out;
1714         }
1715
1716         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1717                                           sizeof(struct nf_conn));
1718         if (ret < 0) {
1719                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1720                 goto err_free_hash;
1721         }
1722
1723         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1724                                         sizeof(struct nf_conntrack_expect),
1725                                         0, 0, NULL, NULL);
1726         if (!nf_conntrack_expect_cachep) {
1727                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1728                 goto err_free_conntrack_slab;
1729         }
1730
1731         /* Don't NEED lock here, but good form anyway. */
1732         write_lock_bh(&nf_conntrack_lock);
1733         for (i = 0; i < PF_MAX; i++)
1734                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1735         write_unlock_bh(&nf_conntrack_lock);
1736
1737         /* For use by REJECT target */
1738         ip_ct_attach = __nf_conntrack_attach;
1739
1740         /* Set up fake conntrack:
1741             - to never be deleted, not in any hashes */
1742         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1743         /*  - and look it like as a confirmed connection */
1744         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1745
1746         return ret;
1747
1748 err_free_conntrack_slab:
1749         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1750 err_free_hash:
1751         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1752                             nf_conntrack_htable_size);
1753 err_out:
1754         return -ENOMEM;
1755 }