net/ipv4/tcp_ipv4.c  [linux-2.6-omap-h63xx.git]
[INET]: Move the TCP hashtable functions/structs to inet_hashtables.[ch]
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
47  *                                      year-long coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      to a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 };
98
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
105 int tcp_port_rover = 1024 - 1;
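/* For example (an illustrative sketch, assuming the usual /proc and sysctl
 * names for this tunable), an administrator can widen the range at runtime:
 *
 *   # echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range
 * or
 *   # sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 */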
106
107 /* Caller must disable local BH processing. */
108 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
109 {
110         struct inet_bind_hashbucket *head =
111                                 &tcp_bhash[inet_bhashfn(inet_sk(child)->num,
112                                                         tcp_bhash_size)];
113         struct inet_bind_bucket *tb;
114
115         spin_lock(&head->lock);
116         tb = tcp_sk(sk)->bind_hash;
117         sk_add_bind_node(child, &tb->owners);
118         tcp_sk(child)->bind_hash = tb;
119         spin_unlock(&head->lock);
120 }
121
122 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
123 {
124         local_bh_disable();
125         __tcp_inherit_port(sk, child);
126         local_bh_enable();
127 }
128
129 void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
130                    const unsigned short snum)
131 {
132         inet_sk(sk)->num = snum;
133         sk_add_bind_node(sk, &tb->owners);
134         tcp_sk(sk)->bind_hash = tb;
135 }
136
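/* Summary of the conflict rule implemented below: another socket bound to the
 * same port conflicts if it shares (or wildcards) the bound device and either
 * SO_REUSEADDR is not set on both sockets, the other socket is listening, or
 * the two receive addresses are equal (or one of them is a wildcard).
 */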
137 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
138 {
139         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
140         struct sock *sk2;
141         struct hlist_node *node;
142         int reuse = sk->sk_reuse;
143
144         sk_for_each_bound(sk2, node, &tb->owners) {
145                 if (sk != sk2 &&
146                     !tcp_v6_ipv6only(sk2) &&
147                     (!sk->sk_bound_dev_if ||
148                      !sk2->sk_bound_dev_if ||
149                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
150                         if (!reuse || !sk2->sk_reuse ||
151                             sk2->sk_state == TCP_LISTEN) {
152                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
153                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
154                                     sk2_rcv_saddr == sk_rcv_saddr)
155                                         break;
156                         }
157                 }
158         }
159         return node != NULL;
160 }
161
162 /* Obtain a reference to a local port for the given sock,
163  * if snum is zero it means select any available local port.
164  */
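/* Userspace view of the two cases handled here (an illustrative sketch using
 * the standard socket API; 8080 is an arbitrary example port):
 *
 *   struct sockaddr_in a = { .sin_family = AF_INET };
 *   a.sin_addr.s_addr = htonl(INADDR_ANY);
 *
 *   a.sin_port = 0;                              // snum == 0: pick any free port
 *   bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *   a.sin_port = htons(8080);                    // snum != 0: request this port
 *   bind(fd, (struct sockaddr *)&a, sizeof(a));  // (on a different socket)
 */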
165 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
166 {
167         struct inet_bind_hashbucket *head;
168         struct hlist_node *node;
169         struct inet_bind_bucket *tb;
170         int ret;
171
172         local_bh_disable();
173         if (!snum) {
174                 int low = sysctl_local_port_range[0];
175                 int high = sysctl_local_port_range[1];
176                 int remaining = (high - low) + 1;
177                 int rover;
178
179                 spin_lock(&tcp_portalloc_lock);
180                 if (tcp_port_rover < low)
181                         rover = low;
182                 else
183                         rover = tcp_port_rover;
184                 do {
185                         rover++;
186                         if (rover > high)
187                                 rover = low;
188                         head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
189                         spin_lock(&head->lock);
190                         inet_bind_bucket_for_each(tb, node, &head->chain)
191                                 if (tb->port == rover)
192                                         goto next;
193                         break;
194                 next:
195                         spin_unlock(&head->lock);
196                 } while (--remaining > 0);
197                 tcp_port_rover = rover;
198                 spin_unlock(&tcp_portalloc_lock);
199
200                 /* Exhausted local port range during search?  It is not
201                  * possible for us to be holding one of the bind hash
202                  * locks if this test triggers, because if 'remaining'
203                  * drops to zero, we broke out of the do/while loop at
204                  * the top level, not from the 'break;' statement.
205                  */
206                 ret = 1;
207                 if (unlikely(remaining <= 0))
208                         goto fail;
209
210                 /* OK, here is the one we will use.  HEAD is
211                  * non-NULL and we hold its lock.
212                  */
213                 snum = rover;
214         } else {
215                 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
216                 spin_lock(&head->lock);
217                 inet_bind_bucket_for_each(tb, node, &head->chain)
218                         if (tb->port == snum)
219                                 goto tb_found;
220         }
221         tb = NULL;
222         goto tb_not_found;
223 tb_found:
224         if (!hlist_empty(&tb->owners)) {
225                 if (sk->sk_reuse > 1)
226                         goto success;
227                 if (tb->fastreuse > 0 &&
228                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
229                         goto success;
230                 } else {
231                         ret = 1;
232                         if (tcp_bind_conflict(sk, tb))
233                                 goto fail_unlock;
234                 }
235         }
236 tb_not_found:
237         ret = 1;
238         if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
239                 goto fail_unlock;
240         if (hlist_empty(&tb->owners)) {
241                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
242                         tb->fastreuse = 1;
243                 else
244                         tb->fastreuse = 0;
245         } else if (tb->fastreuse &&
246                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
247                 tb->fastreuse = 0;
248 success:
249         if (!tcp_sk(sk)->bind_hash)
250                 tcp_bind_hash(sk, tb, snum);
251         BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
252         ret = 0;
253
254 fail_unlock:
255         spin_unlock(&head->lock);
256 fail:
257         local_bh_enable();
258         return ret;
259 }
260
261 /* Get rid of any references to a local port held by the
262  * given sock.
263  */
264 static void __tcp_put_port(struct sock *sk)
265 {
266         struct inet_sock *inet = inet_sk(sk);
267         struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
268                                                                     tcp_bhash_size)];
269         struct inet_bind_bucket *tb;
270
271         spin_lock(&head->lock);
272         tb = tcp_sk(sk)->bind_hash;
273         __sk_del_bind_node(sk);
274         tcp_sk(sk)->bind_hash = NULL;
275         inet->num = 0;
276         inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
277         spin_unlock(&head->lock);
278 }
279
280 void tcp_put_port(struct sock *sk)
281 {
282         local_bh_disable();
283         __tcp_put_port(sk);
284         local_bh_enable();
285 }
286
287 /* This lock without WQ_FLAG_EXCLUSIVE is fine on UP but can be very bad on SMP:
288  * when several writers sleep and a reader wakes them up, all but one
289  * immediately hit the write lock and grab all the CPUs. An exclusive sleep solves
290  * this, _but_ remember that it adds useless work on UP machines (a wakeup on each
291  * exclusive lock release). It should really be ifdef'ed.
292  */
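/* tcp_listen_wlock() takes the listening-hash write lock, but first waits
 * (on an exclusive waitqueue) until all lockless readers counted in
 * tcp_lhash_users have drained.
 */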
293
294 void tcp_listen_wlock(void)
295 {
296         write_lock(&tcp_lhash_lock);
297
298         if (atomic_read(&tcp_lhash_users)) {
299                 DEFINE_WAIT(wait);
300
301                 for (;;) {
302                         prepare_to_wait_exclusive(&tcp_lhash_wait,
303                                                 &wait, TASK_UNINTERRUPTIBLE);
304                         if (!atomic_read(&tcp_lhash_users))
305                                 break;
306                         write_unlock_bh(&tcp_lhash_lock);
307                         schedule();
308                         write_lock_bh(&tcp_lhash_lock);
309                 }
310
311                 finish_wait(&tcp_lhash_wait, &wait);
312         }
313 }
314
315 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
316 {
317         struct hlist_head *list;
318         rwlock_t *lock;
319
320         BUG_TRAP(sk_unhashed(sk));
321         if (listen_possible && sk->sk_state == TCP_LISTEN) {
322                 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
323                 lock = &tcp_lhash_lock;
324                 tcp_listen_wlock();
325         } else {
326                 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
327                 list = &tcp_ehash[sk->sk_hashent].chain;
328                 lock = &tcp_ehash[sk->sk_hashent].lock;
329                 write_lock(lock);
330         }
331         __sk_add_node(sk, list);
332         sock_prot_inc_use(sk->sk_prot);
333         write_unlock(lock);
334         if (listen_possible && sk->sk_state == TCP_LISTEN)
335                 wake_up(&tcp_lhash_wait);
336 }
337
338 static void tcp_v4_hash(struct sock *sk)
339 {
340         if (sk->sk_state != TCP_CLOSE) {
341                 local_bh_disable();
342                 __tcp_v4_hash(sk, 1);
343                 local_bh_enable();
344         }
345 }
346
347 void tcp_unhash(struct sock *sk)
348 {
349         rwlock_t *lock;
350
351         if (sk_unhashed(sk))
352                 goto ende;
353
354         if (sk->sk_state == TCP_LISTEN) {
355                 local_bh_disable();
356                 tcp_listen_wlock();
357                 lock = &tcp_lhash_lock;
358         } else {
359                 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
360                 lock = &head->lock;
361                 write_lock_bh(&head->lock);
362         }
363
364         if (__sk_del_node_init(sk))
365                 sock_prot_dec_use(sk->sk_prot);
366         write_unlock_bh(lock);
367
368  ende:
369         if (sk->sk_state == TCP_LISTEN)
370                 wake_up(&tcp_lhash_wait);
371 }
372
373 /* Don't inline this cruft.  There are some nice properties to
374  * exploit here.  The BSD API does not allow a listening TCP
375  * socket to specify the remote port nor the remote address for the
376  * connection, so always assume those are both wildcarded
377  * during the search since they can never be otherwise.
378  */
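/* The lookup below scores each candidate listener:
 *   +1 if the socket is plain PF_INET (rather than an IPv6 socket),
 *   +2 if its rcv_saddr matches the destination address exactly,
 *   +2 if it is bound to the interface the segment arrived on.
 * A score of 5 is a perfect match and ends the walk immediately; otherwise
 * the highest-scoring listener wins.
 */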
379 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
380                                              const u32 daddr,
381                                              const unsigned short hnum,
382                                              const int dif)
383 {
384         struct sock *result = NULL, *sk;
385         struct hlist_node *node;
386         int score, hiscore;
387
388         hiscore=-1;
389         sk_for_each(sk, node, head) {
390                 struct inet_sock *inet = inet_sk(sk);
391
392                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
393                         __u32 rcv_saddr = inet->rcv_saddr;
394
395                         score = (sk->sk_family == PF_INET ? 1 : 0);
396                         if (rcv_saddr) {
397                                 if (rcv_saddr != daddr)
398                                         continue;
399                                 score+=2;
400                         }
401                         if (sk->sk_bound_dev_if) {
402                                 if (sk->sk_bound_dev_if != dif)
403                                         continue;
404                                 score+=2;
405                         }
406                         if (score == 5)
407                                 return sk;
408                         if (score > hiscore) {
409                                 hiscore = score;
410                                 result = sk;
411                         }
412                 }
413         }
414         return result;
415 }
416
417 /* Optimize the common listener case. */
418 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
419                                                   const unsigned short hnum,
420                                                   const int dif)
421 {
422         struct sock *sk = NULL;
423         struct hlist_head *head;
424
425         read_lock(&tcp_lhash_lock);
426         head = &tcp_listening_hash[inet_lhashfn(hnum)];
427         if (!hlist_empty(head)) {
428                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
429
430                 if (inet->num == hnum && !sk->sk_node.next &&
431                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
432                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
433                     !sk->sk_bound_dev_if)
434                         goto sherry_cache;
435                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
436         }
437         if (sk) {
438 sherry_cache:
439                 sock_hold(sk);
440         }
441         read_unlock(&tcp_lhash_lock);
442         return sk;
443 }
444
445 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
446  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
447  *
448  * Local BH must be disabled here.
449  */
450
451 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
452                                                        const u16 sport,
453                                                        const u32 daddr,
454                                                        const u16 hnum,
455                                                        const int dif)
456 {
457         struct inet_ehash_bucket *head;
458         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
459         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
460         struct sock *sk;
461         struct hlist_node *node;
462         /* Optimize here for a direct hit; only listening sockets can
463          * have wildcards anyway.
464          */
465         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
466         head = &tcp_ehash[hash];
467         read_lock(&head->lock);
468         sk_for_each(sk, node, &head->chain) {
469                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
470                         goto hit; /* You sunk my battleship! */
471         }
472
473         /* Must check for a TIME_WAIT'er before going to listener hash. */
474         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
475                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
476                         goto hit;
477         }
478         sk = NULL;
479 out:
480         read_unlock(&head->lock);
481         return sk;
482 hit:
483         sock_hold(sk);
484         goto out;
485 }
486
487 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
488                                            u32 daddr, u16 hnum, int dif)
489 {
490         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
491                                                       daddr, hnum, dif);
492
493         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
494 }
495
496 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
497                                   u16 dport, int dif)
498 {
499         struct sock *sk;
500
501         local_bh_disable();
502         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
503         local_bh_enable();
504
505         return sk;
506 }
507
508 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
509
510 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
511 {
512         return secure_tcp_sequence_number(skb->nh.iph->daddr,
513                                           skb->nh.iph->saddr,
514                                           skb->h.th->dest,
515                                           skb->h.th->source);
516 }
517
518 /* called with local bh disabled */
519 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
520                                       struct tcp_tw_bucket **twp)
521 {
522         struct inet_sock *inet = inet_sk(sk);
523         u32 daddr = inet->rcv_saddr;
524         u32 saddr = inet->daddr;
525         int dif = sk->sk_bound_dev_if;
526         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
527         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
528         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
529         struct inet_ehash_bucket *head = &tcp_ehash[hash];
530         struct sock *sk2;
531         struct hlist_node *node;
532         struct tcp_tw_bucket *tw;
533
534         write_lock(&head->lock);
535
536         /* Check TIME-WAIT sockets first. */
537         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
538                 tw = (struct tcp_tw_bucket *)sk2;
539
540                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
541                         struct tcp_sock *tp = tcp_sk(sk);
542
543                         /* With PAWS, this is safe from the viewpoint
544                            of data integrity. Even without PAWS it
545                            is safe provided the sequence spaces do not
546                            overlap, i.e. at data rates <= 80 Mbit/sec.
547
548                            Actually, the idea is close to VJ's one, except
549                            that the timestamp cache is held not per host
550                            but per port pair, and the TW bucket is used
551                            as the state holder.
552
553                            If the TW bucket has already been destroyed, we
554                            fall back to VJ's scheme and use the initial
555                            timestamp retrieved from the peer table.
556                          */
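                        /* Note: the reused connection's write_seq starts just
                         * past the old incarnation's snd_nxt plus the maximum
                         * window (65535 + 2), so stray segments from the old
                         * connection cannot be mistaken for new data.
                         */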
557                         if (tw->tw_ts_recent_stamp &&
558                             (!twp || (sysctl_tcp_tw_reuse &&
559                                       xtime.tv_sec -
560                                       tw->tw_ts_recent_stamp > 1))) {
561                                 if ((tp->write_seq =
562                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
563                                         tp->write_seq = 1;
564                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
565                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
566                                 sock_hold(sk2);
567                                 goto unique;
568                         } else
569                                 goto not_unique;
570                 }
571         }
572         tw = NULL;
573
574         /* And established part... */
575         sk_for_each(sk2, node, &head->chain) {
576                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
577                         goto not_unique;
578         }
579
580 unique:
581         /* Must record num and sport now. Otherwise we will see
582          * a socket with a funny identity in the hash table. */
583         inet->num = lport;
584         inet->sport = htons(lport);
585         sk->sk_hashent = hash;
586         BUG_TRAP(sk_unhashed(sk));
587         __sk_add_node(sk, &head->chain);
588         sock_prot_inc_use(sk->sk_prot);
589         write_unlock(&head->lock);
590
591         if (twp) {
592                 *twp = tw;
593                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
594         } else if (tw) {
595                 /* Silly. Should hash-dance instead... */
596                 tcp_tw_deschedule(tw);
597                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
598
599                 tcp_tw_put(tw);
600         }
601
602         return 0;
603
604 not_unique:
605         write_unlock(&head->lock);
606         return -EADDRNOTAVAIL;
607 }
608
609 static inline u32 connect_port_offset(const struct sock *sk)
610 {
611         const struct inet_sock *inet = inet_sk(sk);
612
613         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
614                                          inet->dport);
615 }
616
617 /*
618  * Bind a port for a connect operation and hash it.
619  */
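/* When no local port is bound yet, the search below probes ports
 * low + (i + offset) % range for i = 1..range, where "offset" mixes a
 * per-destination secret (connect_port_offset()) with a rolling static
 * "hint", so consecutive connects start their search at different places
 * in the range.
 */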
620 static inline int tcp_v4_hash_connect(struct sock *sk)
621 {
622         const unsigned short snum = inet_sk(sk)->num;
623         struct inet_bind_hashbucket *head;
624         struct inet_bind_bucket *tb;
625         int ret;
626
627         if (!snum) {
628                 int low = sysctl_local_port_range[0];
629                 int high = sysctl_local_port_range[1];
630                 int range = high - low;
631                 int i;
632                 int port;
633                 static u32 hint;
634                 u32 offset = hint + connect_port_offset(sk);
635                 struct hlist_node *node;
636                 struct tcp_tw_bucket *tw = NULL;
637
638                 local_bh_disable();
639                 for (i = 1; i <= range; i++) {
640                         port = low + (i + offset) % range;
641                         head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
642                         spin_lock(&head->lock);
643
644                         /* Does not bother with rcv_saddr checks,
645                          * because the established check is already
646                          * unique enough.
647                          */
648                         inet_bind_bucket_for_each(tb, node, &head->chain) {
649                                 if (tb->port == port) {
650                                         BUG_TRAP(!hlist_empty(&tb->owners));
651                                         if (tb->fastreuse >= 0)
652                                                 goto next_port;
653                                         if (!__tcp_v4_check_established(sk,
654                                                                         port,
655                                                                         &tw))
656                                                 goto ok;
657                                         goto next_port;
658                                 }
659                         }
660
661                         tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
662                         if (!tb) {
663                                 spin_unlock(&head->lock);
664                                 break;
665                         }
666                         tb->fastreuse = -1;
667                         goto ok;
668
669                 next_port:
670                         spin_unlock(&head->lock);
671                 }
672                 local_bh_enable();
673
674                 return -EADDRNOTAVAIL;
675
676 ok:
677                 hint += i;
678
679                 /* Head lock still held and bh's disabled */
680                 tcp_bind_hash(sk, tb, port);
681                 if (sk_unhashed(sk)) {
682                         inet_sk(sk)->sport = htons(port);
683                         __tcp_v4_hash(sk, 0);
684                 }
685                 spin_unlock(&head->lock);
686
687                 if (tw) {
688                         tcp_tw_deschedule(tw);
689                         tcp_tw_put(tw);
690                 }
691
692                 ret = 0;
693                 goto out;
694         }
695
696         head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
697         tb  = tcp_sk(sk)->bind_hash;
698         spin_lock_bh(&head->lock);
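        /* If this socket is the only owner of its bind bucket, no other
         * socket can share the same (saddr, sport, daddr, dport) tuple, so we
         * can hash straight into the established table without the full
         * uniqueness check below.
         */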
699         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
700                 __tcp_v4_hash(sk, 0);
701                 spin_unlock_bh(&head->lock);
702                 return 0;
703         } else {
704                 spin_unlock(&head->lock);
705                 /* No definite answer... Walk to established hash table */
706                 ret = __tcp_v4_check_established(sk, snum, NULL);
707 out:
708                 local_bh_enable();
709                 return ret;
710         }
711 }
712
713 /* This will initiate an outgoing connection. */
714 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
715 {
716         struct inet_sock *inet = inet_sk(sk);
717         struct tcp_sock *tp = tcp_sk(sk);
718         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
719         struct rtable *rt;
720         u32 daddr, nexthop;
721         int tmp;
722         int err;
723
724         if (addr_len < sizeof(struct sockaddr_in))
725                 return -EINVAL;
726
727         if (usin->sin_family != AF_INET)
728                 return -EAFNOSUPPORT;
729
730         nexthop = daddr = usin->sin_addr.s_addr;
731         if (inet->opt && inet->opt->srr) {
732                 if (!daddr)
733                         return -EINVAL;
734                 nexthop = inet->opt->faddr;
735         }
736
737         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
738                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
739                                IPPROTO_TCP,
740                                inet->sport, usin->sin_port, sk);
741         if (tmp < 0)
742                 return tmp;
743
744         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
745                 ip_rt_put(rt);
746                 return -ENETUNREACH;
747         }
748
749         if (!inet->opt || !inet->opt->srr)
750                 daddr = rt->rt_dst;
751
752         if (!inet->saddr)
753                 inet->saddr = rt->rt_src;
754         inet->rcv_saddr = inet->saddr;
755
756         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
757                 /* Reset inherited state */
758                 tp->rx_opt.ts_recent       = 0;
759                 tp->rx_opt.ts_recent_stamp = 0;
760                 tp->write_seq              = 0;
761         }
762
763         if (sysctl_tcp_tw_recycle &&
764             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
765                 struct inet_peer *peer = rt_get_peer(rt);
766
767                 /* VJ's idea. We save last timestamp seen from
768                  * the destination in peer table, when entering state TIME-WAIT
769                  * and initialize rx_opt.ts_recent from it, when trying new connection.
770                  */
771
772                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
773                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
774                         tp->rx_opt.ts_recent = peer->tcp_ts;
775                 }
776         }
777
778         inet->dport = usin->sin_port;
779         inet->daddr = daddr;
780
781         tp->ext_header_len = 0;
782         if (inet->opt)
783                 tp->ext_header_len = inet->opt->optlen;
784
785         tp->rx_opt.mss_clamp = 536;
786
787         /* Socket identity is still unknown (sport may be zero).
788          * However, we set the state to SYN-SENT and, without releasing the
789          * socket lock, select a source port, enter ourselves into the hash
790          * tables and complete initialization after this.
791          */
792         tcp_set_state(sk, TCP_SYN_SENT);
793         err = tcp_v4_hash_connect(sk);
794         if (err)
795                 goto failure;
796
797         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
798         if (err)
799                 goto failure;
800
801         /* OK, now commit destination to socket.  */
802         sk_setup_caps(sk, &rt->u.dst);
803
804         if (!tp->write_seq)
805                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
806                                                            inet->daddr,
807                                                            inet->sport,
808                                                            usin->sin_port);
809
810         inet->id = tp->write_seq ^ jiffies;
811
812         err = tcp_connect(sk);
813         rt = NULL;
814         if (err)
815                 goto failure;
816
817         return 0;
818
819 failure:
820         /* This unhashes the socket and releases the local port, if necessary. */
821         tcp_set_state(sk, TCP_CLOSE);
822         ip_rt_put(rt);
823         sk->sk_route_caps = 0;
824         inet->dport = 0;
825         return err;
826 }
827
828 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
829 {
830         return ((struct rtable *)skb->dst)->rt_iif;
831 }
832
833 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
834 {
835         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
836 }
837
838 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
839                                               struct request_sock ***prevp,
840                                               __u16 rport,
841                                               __u32 raddr, __u32 laddr)
842 {
843         struct listen_sock *lopt = tp->accept_queue.listen_opt;
844         struct request_sock *req, **prev;
845
846         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
847              (req = *prev) != NULL;
848              prev = &req->dl_next) {
849                 const struct inet_request_sock *ireq = inet_rsk(req);
850
851                 if (ireq->rmt_port == rport &&
852                     ireq->rmt_addr == raddr &&
853                     ireq->loc_addr == laddr &&
854                     TCP_INET_FAMILY(req->rsk_ops->family)) {
855                         BUG_TRAP(!req->sk);
856                         *prevp = prev;
857                         break;
858                 }
859         }
860
861         return req;
862 }
863
864 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
865 {
866         struct tcp_sock *tp = tcp_sk(sk);
867         struct listen_sock *lopt = tp->accept_queue.listen_opt;
868         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
869
870         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
871         tcp_synq_added(sk);
872 }
873
874
875 /*
876  * This routine does path mtu discovery as defined in RFC1191.
877  */
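/* In short: on an ICMP FRAG_NEEDED we update the cached route's PMTU, record
 * a soft EMSGSIZE error if the socket insists on DF, and, if the new PMTU is
 * below our cached value, shrink the MSS and retransmit immediately via
 * tcp_simple_retransmit() instead of waiting for the retransmit timer.
 */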
878 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
879                                      u32 mtu)
880 {
881         struct dst_entry *dst;
882         struct inet_sock *inet = inet_sk(sk);
883         struct tcp_sock *tp = tcp_sk(sk);
884
885         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
886          * sent out by Linux are always < 576 bytes, so they should go through
887          * unfragmented).
888          */
889         if (sk->sk_state == TCP_LISTEN)
890                 return;
891
892         /* We don't check in the dst entry whether pmtu discovery is forbidden
893          * on this route. We just assume that no packet-too-big packets
894          * are sent back when pmtu discovery is not active.
895          * There is a small race when the user changes this flag on the
896          * route, but I think that's acceptable.
897          */
898         if ((dst = __sk_dst_check(sk, 0)) == NULL)
899                 return;
900
901         dst->ops->update_pmtu(dst, mtu);
902
903         /* Something is about to go wrong... Remember the soft error
904          * in case this connection is not able to recover.
905          */
906         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
907                 sk->sk_err_soft = EMSGSIZE;
908
909         mtu = dst_mtu(dst);
910
911         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
912             tp->pmtu_cookie > mtu) {
913                 tcp_sync_mss(sk, mtu);
914
915                 /* Resend the TCP packet because it's
916                  * clear that the old packet has been
917                  * dropped. This is the new "fast" path mtu
918                  * discovery.
919                  */
920                 tcp_simple_retransmit(sk);
921         } /* else let the usual retransmit timer handle it */
922 }
923
924 /*
925  * This routine is called by the ICMP module when it gets some
926  * sort of error condition.  If err < 0 then the socket should
927  * be closed and the error returned to the user.  If err > 0
928  * it's just the icmp type << 8 | icmp code.  After adjustment, the
929  * header points to the first 8 bytes of the tcp header.  We need
930  * to find the appropriate port.
931  *
932  * The locking strategy used here is very "optimistic". When
933  * someone else accesses the socket, the ICMP is just dropped,
934  * and for some paths there is no check at all.
935  * A more general error queue to queue errors for later handling
936  * is probably better.
937  *
938  */
939
940 void tcp_v4_err(struct sk_buff *skb, u32 info)
941 {
942         struct iphdr *iph = (struct iphdr *)skb->data;
943         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
944         struct tcp_sock *tp;
945         struct inet_sock *inet;
946         int type = skb->h.icmph->type;
947         int code = skb->h.icmph->code;
948         struct sock *sk;
949         __u32 seq;
950         int err;
951
952         if (skb->len < (iph->ihl << 2) + 8) {
953                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
954                 return;
955         }
956
957         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
958                            th->source, tcp_v4_iif(skb));
959         if (!sk) {
960                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
961                 return;
962         }
963         if (sk->sk_state == TCP_TIME_WAIT) {
964                 tcp_tw_put((struct tcp_tw_bucket *)sk);
965                 return;
966         }
967
968         bh_lock_sock(sk);
969         /* If too many ICMPs get dropped on busy
970          * servers this needs to be solved differently.
971          */
972         if (sock_owned_by_user(sk))
973                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
974
975         if (sk->sk_state == TCP_CLOSE)
976                 goto out;
977
978         tp = tcp_sk(sk);
979         seq = ntohl(th->seq);
980         if (sk->sk_state != TCP_LISTEN &&
981             !between(seq, tp->snd_una, tp->snd_nxt)) {
982                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
983                 goto out;
984         }
985
986         switch (type) {
987         case ICMP_SOURCE_QUENCH:
988                 /* Just silently ignore these. */
989                 goto out;
990         case ICMP_PARAMETERPROB:
991                 err = EPROTO;
992                 break;
993         case ICMP_DEST_UNREACH:
994                 if (code > NR_ICMP_UNREACH)
995                         goto out;
996
997                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
998                         if (!sock_owned_by_user(sk))
999                                 do_pmtu_discovery(sk, iph, info);
1000                         goto out;
1001                 }
1002
1003                 err = icmp_err_convert[code].errno;
1004                 break;
1005         case ICMP_TIME_EXCEEDED:
1006                 err = EHOSTUNREACH;
1007                 break;
1008         default:
1009                 goto out;
1010         }
1011
1012         switch (sk->sk_state) {
1013                 struct request_sock *req, **prev;
1014         case TCP_LISTEN:
1015                 if (sock_owned_by_user(sk))
1016                         goto out;
1017
1018                 req = tcp_v4_search_req(tp, &prev, th->dest,
1019                                         iph->daddr, iph->saddr);
1020                 if (!req)
1021                         goto out;
1022
1023                 /* ICMPs are not backlogged, hence we cannot get
1024                    an established socket here.
1025                  */
1026                 BUG_TRAP(!req->sk);
1027
1028                 if (seq != tcp_rsk(req)->snt_isn) {
1029                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1030                         goto out;
1031                 }
1032
1033                 /*
1034                  * Still in SYN_RECV, just remove it silently.
1035                  * There is no good way to pass the error to the newly
1036                  * created socket, and POSIX does not want network
1037                  * errors returned from accept().
1038                  */
1039                 tcp_synq_drop(sk, req, prev);
1040                 goto out;
1041
1042         case TCP_SYN_SENT:
1043         case TCP_SYN_RECV:  /* Cannot happen.
1044                                Well, it can, e.g. if SYNs crossed.
1045                              */
1046                 if (!sock_owned_by_user(sk)) {
1047                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1048                         sk->sk_err = err;
1049
1050                         sk->sk_error_report(sk);
1051
1052                         tcp_done(sk);
1053                 } else {
1054                         sk->sk_err_soft = err;
1055                 }
1056                 goto out;
1057         }
1058
1059         /* If we've already connected we will keep trying
1060          * until we time out, or the user gives up.
1061          *
1062          * rfc1122 4.2.3.9 allows us to consider as hard errors
1063          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1064          * but it is obsoleted by pmtu discovery).
1065          *
1066          * Note that in the modern internet, where routing is unreliable
1067          * and broken firewalls sit in every dark corner, sending random
1068          * errors ordered by their masters, even these two messages finally lose
1069          * their original sense (even Linux sends invalid PORT_UNREACHs).
1070          *
1071          * Now we are in compliance with RFCs.
1072          *                                                      --ANK (980905)
1073          */
1074
1075         inet = inet_sk(sk);
1076         if (!sock_owned_by_user(sk) && inet->recverr) {
1077                 sk->sk_err = err;
1078                 sk->sk_error_report(sk);
1079         } else  { /* Only an error on timeout */
1080                 sk->sk_err_soft = err;
1081         }
1082
1083 out:
1084         bh_unlock_sock(sk);
1085         sock_put(sk);
1086 }
1087
1088 /* This routine computes an IPv4 TCP checksum. */
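/* With hardware checksumming (CHECKSUM_HW) only the complemented pseudo-header
 * sum is placed in th->check and skb->csum records the offset of the checksum
 * field, leaving the rest to the NIC; otherwise the full checksum is computed
 * in software, folding in the payload sum already accumulated in skb->csum.
 */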
1089 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1090                        struct sk_buff *skb)
1091 {
1092         struct inet_sock *inet = inet_sk(sk);
1093
1094         if (skb->ip_summed == CHECKSUM_HW) {
1095                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1096                 skb->csum = offsetof(struct tcphdr, check);
1097         } else {
1098                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1099                                          csum_partial((char *)th,
1100                                                       th->doff << 2,
1101                                                       skb->csum));
1102         }
1103 }
1104
1105 /*
1106  *      This routine will send an RST to the other tcp.
1107  *
1108  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1109  *                    for the reset?
1110  *      Answer: if a packet caused an RST, it is not destined for a socket
1111  *              existing in our system; if it is matched to a socket,
1112  *              it is just a duplicate segment or a bug in the other side's TCP.
1113  *              So we build the reply based only on the parameters that
1114  *              arrived with the segment.
1115  *      Exception: precedence violation. We do not implement it in any case.
1116  */
1117
1118 static void tcp_v4_send_reset(struct sk_buff *skb)
1119 {
1120         struct tcphdr *th = skb->h.th;
1121         struct tcphdr rth;
1122         struct ip_reply_arg arg;
1123
1124         /* Never send a reset in response to a reset. */
1125         if (th->rst)
1126                 return;
1127
1128         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1129                 return;
1130
1131         /* Swap the send and the receive. */
1132         memset(&rth, 0, sizeof(struct tcphdr));
1133         rth.dest   = th->source;
1134         rth.source = th->dest;
1135         rth.doff   = sizeof(struct tcphdr) / 4;
1136         rth.rst    = 1;
1137
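        /* If the offending segment carried an ACK, the RST simply takes its
         * sequence number from that ACK; otherwise we must ACK everything the
         * peer sent: its seq plus the SYN/FIN flags plus the payload length
         * (skb->len minus the TCP header, th->doff << 2).
         */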
1138         if (th->ack) {
1139                 rth.seq = th->ack_seq;
1140         } else {
1141                 rth.ack = 1;
1142                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1143                                     skb->len - (th->doff << 2));
1144         }
1145
1146         memset(&arg, 0, sizeof arg);
1147         arg.iov[0].iov_base = (unsigned char *)&rth;
1148         arg.iov[0].iov_len  = sizeof rth;
1149         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1150                                       skb->nh.iph->saddr, /*XXX*/
1151                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1152         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1153
1154         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1155
1156         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1157         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1158 }
1159
1160 /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
1161    outside of socket context, is certainly ugly. What can I do?
1162  */
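/* The reply built on the stack is a bare TCP header plus, when a timestamp
 * is echoed, a 12-byte option block: NOP, NOP, TIMESTAMP, length 10, then
 * TSval (tcp_time_stamp) and TSecr (the peer's ts).
 */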
1163
1164 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1165                             u32 win, u32 ts)
1166 {
1167         struct tcphdr *th = skb->h.th;
1168         struct {
1169                 struct tcphdr th;
1170                 u32 tsopt[3];
1171         } rep;
1172         struct ip_reply_arg arg;
1173
1174         memset(&rep.th, 0, sizeof(struct tcphdr));
1175         memset(&arg, 0, sizeof arg);
1176
1177         arg.iov[0].iov_base = (unsigned char *)&rep;
1178         arg.iov[0].iov_len  = sizeof(rep.th);
1179         if (ts) {
1180                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1181                                      (TCPOPT_TIMESTAMP << 8) |
1182                                      TCPOLEN_TIMESTAMP);
1183                 rep.tsopt[1] = htonl(tcp_time_stamp);
1184                 rep.tsopt[2] = htonl(ts);
1185                 arg.iov[0].iov_len = sizeof(rep);
1186         }
1187
1188         /* Swap the send and the receive. */
1189         rep.th.dest    = th->source;
1190         rep.th.source  = th->dest;
1191         rep.th.doff    = arg.iov[0].iov_len / 4;
1192         rep.th.seq     = htonl(seq);
1193         rep.th.ack_seq = htonl(ack);
1194         rep.th.ack     = 1;
1195         rep.th.window  = htons(win);
1196
1197         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1198                                       skb->nh.iph->saddr, /*XXX*/
1199                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1200         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1201
1202         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1203
1204         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1205 }
1206
1207 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1208 {
1209         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1210
1211         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1212                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1213
1214         tcp_tw_put(tw);
1215 }
1216
1217 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1218 {
1219         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1220                         req->ts_recent);
1221 }
1222
1223 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1224                                           struct request_sock *req)
1225 {
1226         struct rtable *rt;
1227         const struct inet_request_sock *ireq = inet_rsk(req);
1228         struct ip_options *opt = inet_rsk(req)->opt;
1229         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1230                             .nl_u = { .ip4_u =
1231                                       { .daddr = ((opt && opt->srr) ?
1232                                                   opt->faddr :
1233                                                   ireq->rmt_addr),
1234                                         .saddr = ireq->loc_addr,
1235                                         .tos = RT_CONN_FLAGS(sk) } },
1236                             .proto = IPPROTO_TCP,
1237                             .uli_u = { .ports =
1238                                        { .sport = inet_sk(sk)->sport,
1239                                          .dport = ireq->rmt_port } } };
1240
1241         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1242                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1243                 return NULL;
1244         }
1245         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1246                 ip_rt_put(rt);
1247                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1248                 return NULL;
1249         }
1250         return &rt->u.dst;
1251 }
1252
1253 /*
1254  *      Send a SYN-ACK after having received an ACK.
1255  *      This still operates on a request_sock only, not on a big
1256  *      socket.
1257  */
1258 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1259                               struct dst_entry *dst)
1260 {
1261         const struct inet_request_sock *ireq = inet_rsk(req);
1262         int err = -1;
1263         struct sk_buff * skb;
1264
1265         /* First, grab a route. */
1266         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1267                 goto out;
1268
1269         skb = tcp_make_synack(sk, dst, req);
1270
1271         if (skb) {
1272                 struct tcphdr *th = skb->h.th;
1273
1274                 th->check = tcp_v4_check(th, skb->len,
1275                                          ireq->loc_addr,
1276                                          ireq->rmt_addr,
1277                                          csum_partial((char *)th, skb->len,
1278                                                       skb->csum));
1279
1280                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1281                                             ireq->rmt_addr,
1282                                             ireq->opt);
1283                 if (err == NET_XMIT_CN)
1284                         err = 0;
1285         }
1286
1287 out:
1288         dst_release(dst);
1289         return err;
1290 }
1291
1292 /*
1293  *      IPv4 request_sock destructor.
1294  */
1295 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1296 {
1297         if (inet_rsk(req)->opt)
1298                 kfree(inet_rsk(req)->opt);
1299 }
1300
1301 static inline void syn_flood_warning(struct sk_buff *skb)
1302 {
1303         static unsigned long warntime;
1304
1305         if (time_after(jiffies, (warntime + HZ * 60))) {
1306                 warntime = jiffies;
1307                 printk(KERN_INFO
1308                        "possible SYN flooding on port %d. Sending cookies.\n",
1309                        ntohs(skb->h.th->dest));
1310         }
1311 }
1312
1313 /*
1314  * Save and compile IPv4 options into the request_sock if needed.
1315  */
1316 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1317                                                      struct sk_buff *skb)
1318 {
1319         struct ip_options *opt = &(IPCB(skb)->opt);
1320         struct ip_options *dopt = NULL;
1321
1322         if (opt && opt->optlen) {
1323                 int opt_size = optlength(opt);
1324                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1325                 if (dopt) {
1326                         if (ip_options_echo(dopt, skb)) {
1327                                 kfree(dopt);
1328                                 dopt = NULL;
1329                         }
1330                 }
1331         }
1332         return dopt;
1333 }
1334
1335 struct request_sock_ops tcp_request_sock_ops = {
1336         .family         =       PF_INET,
1337         .obj_size       =       sizeof(struct tcp_request_sock),
1338         .rtx_syn_ack    =       tcp_v4_send_synack,
1339         .send_ack       =       tcp_v4_reqsk_send_ack,
1340         .destructor     =       tcp_v4_reqsk_destructor,
1341         .send_reset     =       tcp_v4_send_reset,
1342 };
1343
1344 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1345 {
1346         struct inet_request_sock *ireq;
1347         struct tcp_options_received tmp_opt;
1348         struct request_sock *req;
1349         __u32 saddr = skb->nh.iph->saddr;
1350         __u32 daddr = skb->nh.iph->daddr;
1351         __u32 isn = TCP_SKB_CB(skb)->when;
1352         struct dst_entry *dst = NULL;
1353 #ifdef CONFIG_SYN_COOKIES
1354         int want_cookie = 0;
1355 #else
1356 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1357 #endif
1358
1359         /* Never answer SYNs sent to broadcast or multicast addresses */
1360         if (((struct rtable *)skb->dst)->rt_flags &
1361             (RTCF_BROADCAST | RTCF_MULTICAST))
1362                 goto drop;
1363
1364         /* TW buckets are converted to open requests without
1365          * limitation; they conserve resources and the peer is
1366          * evidently a real one.
1367          */
1368         if (tcp_synq_is_full(sk) && !isn) {
1369 #ifdef CONFIG_SYN_COOKIES
1370                 if (sysctl_tcp_syncookies) {
1371                         want_cookie = 1;
1372                 } else
1373 #endif
1374                 goto drop;
1375         }
1376
1377         /* The accept backlog is full. If we have already queued enough
1378          * warm entries in the syn queue, drop the request. That is better than
1379          * clogging the syn queue with openreqs whose timeouts increase
1380          * exponentially.
1381          */
1382         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1383                 goto drop;
1384
1385         req = reqsk_alloc(&tcp_request_sock_ops);
1386         if (!req)
1387                 goto drop;
1388
1389         tcp_clear_options(&tmp_opt);
1390         tmp_opt.mss_clamp = 536;
1391         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1392
1393         tcp_parse_options(skb, &tmp_opt, 0);
1394
1395         if (want_cookie) {
1396                 tcp_clear_options(&tmp_opt);
1397                 tmp_opt.saw_tstamp = 0;
1398         }
1399
1400         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1401                 /* Some OSes (unknown ones, but I see them on a web server whose
1402                  * content is interesting only to Windows
1403                  * users) do not send their timestamp in the SYN. That is the
1404                  * easy case: we simply do not advertise TS support.
1405                  */
1406                 tmp_opt.saw_tstamp = 0;
1407                 tmp_opt.tstamp_ok  = 0;
1408         }
1409         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1410
1411         tcp_openreq_init(req, &tmp_opt, skb);
1412
1413         ireq = inet_rsk(req);
1414         ireq->loc_addr = daddr;
1415         ireq->rmt_addr = saddr;
1416         ireq->opt = tcp_v4_save_options(sk, skb);
1417         if (!want_cookie)
1418                 TCP_ECN_create_request(req, skb->h.th);
1419
1420         if (want_cookie) {
1421 #ifdef CONFIG_SYN_COOKIES
1422                 syn_flood_warning(skb);
1423 #endif
1424                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1425         } else if (!isn) {
1426                 struct inet_peer *peer = NULL;
1427
1428                 /* VJ's idea. We save the last timestamp seen
1429                  * from the destination in the peer table when entering
1430                  * the TIME-WAIT state, and check against it before
1431                  * accepting a new connection request.
1432                  *
1433                  * If "isn" is not zero, this request hit a live
1434                  * timewait bucket, so all the necessary checks
1435                  * were made in the function processing the timewait state.
1436                  */
1437                 if (tmp_opt.saw_tstamp &&
1438                     sysctl_tcp_tw_recycle &&
1439                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1440                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1441                     peer->v4daddr == saddr) {
1442                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1443                             (s32)(peer->tcp_ts - req->ts_recent) >
1444                                                         TCP_PAWS_WINDOW) {
1445                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1446                                 dst_release(dst);
1447                                 goto drop_and_free;
1448                         }
1449                 }
1450                 /* Kill the following clause if you dislike this approach. */
1451                 else if (!sysctl_tcp_syncookies &&
1452                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1453                           (sysctl_max_syn_backlog >> 2)) &&
1454                          (!peer || !peer->tcp_ts_stamp) &&
1455                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1456                         /* Without syncookies, the last quarter of the
1457                          * backlog is reserved for destinations proven
1458                          * to be alive.
1459                          * This means that during a synflood we keep
1460                          * talking only to destinations we already
1461                          * remembered before the flood began.
1462                          */
1463                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1464                                               "request from %u.%u."
1465                                               "%u.%u/%u\n",
1466                                               NIPQUAD(saddr),
1467                                               ntohs(skb->h.th->source)));
1468                         dst_release(dst);
1469                         goto drop_and_free;
1470                 }
1471
1472                 isn = tcp_v4_init_sequence(sk, skb);
1473         }
1474         tcp_rsk(req)->snt_isn = isn;
1475
1476         if (tcp_v4_send_synack(sk, req, dst))
1477                 goto drop_and_free;
1478
1479         if (want_cookie) {
1480                 reqsk_free(req);
1481         } else {
1482                 tcp_v4_synq_add(sk, req);
1483         }
1484         return 0;
1485
1486 drop_and_free:
1487         reqsk_free(req);
1488 drop:
1489         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1490         return 0;
1491 }
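/*
 * Editor's note: a minimal, self-contained sketch (not kernel code) of the
 * wrap-safe timestamp comparison used in the PAWS check above, where
 * (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW decides whether the
 * peer's remembered timestamp is newer than the one carried in the SYN.
 * Casting the unsigned difference to a signed 32-bit value treats the
 * timestamp space as a circle, so "a is newer than b" stays correct across
 * wraparound.  ts_newer() is a hypothetical helper used only for illustration.
 */
#if 0   /* illustration only, never compiled into the kernel */
#include <stdio.h>

static int ts_newer(unsigned int a, unsigned int b)
{
        return (int)(a - b) > 0;
}

int main(void)
{
        printf("%d\n", ts_newer(5u, 0xfffffff0u));   /* 1: 5 was generated after the wrap */
        printf("%d\n", ts_newer(0xfffffff0u, 5u));   /* 0 */
        return 0;
}
#endif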
1492
1493
1494 /*
1495  * The three way handshake has completed - we got a valid synack -
1496  * now create the new socket.
1497  */
1498 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1499                                   struct request_sock *req,
1500                                   struct dst_entry *dst)
1501 {
1502         struct inet_request_sock *ireq;
1503         struct inet_sock *newinet;
1504         struct tcp_sock *newtp;
1505         struct sock *newsk;
1506
1507         if (sk_acceptq_is_full(sk))
1508                 goto exit_overflow;
1509
1510         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1511                 goto exit;
1512
1513         newsk = tcp_create_openreq_child(sk, req, skb);
1514         if (!newsk)
1515                 goto exit;
1516
1517         sk_setup_caps(newsk, dst);
1518
1519         newtp                 = tcp_sk(newsk);
1520         newinet               = inet_sk(newsk);
1521         ireq                  = inet_rsk(req);
1522         newinet->daddr        = ireq->rmt_addr;
1523         newinet->rcv_saddr    = ireq->loc_addr;
1524         newinet->saddr        = ireq->loc_addr;
1525         newinet->opt          = ireq->opt;
1526         ireq->opt             = NULL;
1527         newinet->mc_index     = tcp_v4_iif(skb);
1528         newinet->mc_ttl       = skb->nh.iph->ttl;
1529         newtp->ext_header_len = 0;
1530         if (newinet->opt)
1531                 newtp->ext_header_len = newinet->opt->optlen;
1532         newinet->id = newtp->write_seq ^ jiffies;
1533
1534         tcp_sync_mss(newsk, dst_mtu(dst));
1535         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1536         tcp_initialize_rcv_mss(newsk);
1537
1538         __tcp_v4_hash(newsk, 0);
1539         __tcp_inherit_port(sk, newsk);
1540
1541         return newsk;
1542
1543 exit_overflow:
1544         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1545 exit:
1546         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1547         dst_release(dst);
1548         return NULL;
1549 }
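/*
 * Editor's note: a user-space view (not kernel code) of where the child
 * socket created above ends up.  Once the handshake completes, accept() on
 * the listening socket returns a file descriptor wrapping the sock that
 * tcp_v4_syn_recv_sock() built via tcp_create_openreq_child().
 */
#if 0   /* illustration only */
#include <sys/socket.h>
#include <netinet/in.h>

int accept_one(int listen_fd)
{
        struct sockaddr_in peer;
        socklen_t len = sizeof(peer);

        /* The returned descriptor refers to the child sock set up above. */
        return accept(listen_fd, (struct sockaddr *)&peer, &len);
}
#endif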
1550
1551 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1552 {
1553         struct tcphdr *th = skb->h.th;
1554         struct iphdr *iph = skb->nh.iph;
1555         struct tcp_sock *tp = tcp_sk(sk);
1556         struct sock *nsk;
1557         struct request_sock **prev;
1558         /* Find possible connection requests. */
1559         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1560                                                      iph->saddr, iph->daddr);
1561         if (req)
1562                 return tcp_check_req(sk, skb, req, prev);
1563
1564         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1565                                           th->source,
1566                                           skb->nh.iph->daddr,
1567                                           ntohs(th->dest),
1568                                           tcp_v4_iif(skb));
1569
1570         if (nsk) {
1571                 if (nsk->sk_state != TCP_TIME_WAIT) {
1572                         bh_lock_sock(nsk);
1573                         return nsk;
1574                 }
1575                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1576                 return NULL;
1577         }
1578
1579 #ifdef CONFIG_SYN_COOKIES
1580         if (!th->rst && !th->syn && th->ack)
1581                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1582 #endif
1583         return sk;
1584 }
1585
1586 static int tcp_v4_checksum_init(struct sk_buff *skb)
1587 {
1588         if (skb->ip_summed == CHECKSUM_HW) {
1589                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1590                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1591                                   skb->nh.iph->daddr, skb->csum))
1592                         return 0;
1593
1594                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1595                 skb->ip_summed = CHECKSUM_NONE;
1596         }
1597         if (skb->len <= 76) {
1598                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1599                                  skb->nh.iph->daddr,
1600                                  skb_checksum(skb, 0, skb->len, 0)))
1601                         return -1;
1602                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1603         } else {
1604                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1605                                           skb->nh.iph->saddr,
1606                                           skb->nh.iph->daddr, 0);
1607         }
1608         return 0;
1609 }
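/*
 * Editor's note: a hedged, user-space sketch (not kernel code) of the
 * checksum tcp_v4_check() verifies above: a 16-bit one's-complement sum
 * over an IPv4 pseudo-header (source address, destination address,
 * protocol 6, TCP length) followed by the TCP header and payload, with the
 * checksum field itself zeroed while summing.  The kernel folds partial
 * hardware/software sums (skb->csum) instead of walking bytes like this;
 * addresses are taken in host byte order here purely for simplicity.
 */
#if 0   /* illustration only */
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>         /* IPPROTO_TCP */

static uint16_t tcp4_csum_sketch(uint32_t saddr, uint32_t daddr,
                                 const uint8_t *tcp, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        /* Pseudo-header, folded into 16-bit words. */
        sum += (saddr >> 16) + (saddr & 0xffff);
        sum += (daddr >> 16) + (daddr & 0xffff);
        sum += IPPROTO_TCP;
        sum += len;

        /* TCP header + payload, big-endian 16-bit words (pad odd length). */
        for (i = 0; i + 1 < len; i += 2)
                sum += (tcp[i] << 8) | tcp[i + 1];
        if (len & 1)
                sum += tcp[len - 1] << 8;

        /* Fold the carries and take the one's complement. */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}
#endif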
1610
1611
1612 /* The socket must have its spinlock held when we get
1613  * here.
1614  *
1615  * We have a potential double-lock case here, so even when
1616  * doing backlog processing we use the BH locking scheme.
1617  * This is because we cannot sleep with the original spinlock
1618  * held.
1619  */
1620 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1621 {
1622         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1623                 TCP_CHECK_TIMER(sk);
1624                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1625                         goto reset;
1626                 TCP_CHECK_TIMER(sk);
1627                 return 0;
1628         }
1629
1630         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1631                 goto csum_err;
1632
1633         if (sk->sk_state == TCP_LISTEN) {
1634                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1635                 if (!nsk)
1636                         goto discard;
1637
1638                 if (nsk != sk) {
1639                         if (tcp_child_process(sk, nsk, skb))
1640                                 goto reset;
1641                         return 0;
1642                 }
1643         }
1644
1645         TCP_CHECK_TIMER(sk);
1646         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1647                 goto reset;
1648         TCP_CHECK_TIMER(sk);
1649         return 0;
1650
1651 reset:
1652         tcp_v4_send_reset(skb);
1653 discard:
1654         kfree_skb(skb);
1655         /* Be careful here. If this function gets more complicated and
1656          * gcc suffers from register pressure on the x86, sk (in %ebx)
1657          * might be destroyed here. This current version compiles correctly,
1658          * but you have been warned.
1659          */
1660         return 0;
1661
1662 csum_err:
1663         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1664         goto discard;
1665 }
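/*
 * Editor's note: tcp_v4_do_rcv() is also installed as tcp_prot.backlog_rcv
 * (see the struct proto at the end of this file), so it runs in two places:
 * directly from softirq context in tcp_v4_rcv() when nobody owns the socket
 * lock, and later from process context when release_sock() drains the
 * packets that tcp_v4_rcv() parked with sk_add_backlog().  A simplified,
 * hypothetical sketch of that second path; the real code lives in
 * __release_sock() in net/core/sock.c.
 */
#if 0   /* illustration only */
static void drain_backlog_sketch(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
        while (skb) {
                struct sk_buff *next = skb->next;

                skb->next = NULL;
                sk->sk_backlog_rcv(sk, skb);    /* == tcp_v4_do_rcv for TCP */
                skb = next;
        }
}
#endif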
1666
1667 /*
1668  *      From tcp_input.c
1669  */
1670
1671 int tcp_v4_rcv(struct sk_buff *skb)
1672 {
1673         struct tcphdr *th;
1674         struct sock *sk;
1675         int ret;
1676
1677         if (skb->pkt_type != PACKET_HOST)
1678                 goto discard_it;
1679
1680         /* Count it even if it's bad */
1681         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1682
1683         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1684                 goto discard_it;
1685
1686         th = skb->h.th;
1687
1688         if (th->doff < sizeof(struct tcphdr) / 4)
1689                 goto bad_packet;
1690         if (!pskb_may_pull(skb, th->doff * 4))
1691                 goto discard_it;
1692
1693         /* An explanation is required here, I think.
1694          * Packet length and doff are validated by header prediction,
1695          * provided the case of th->doff == 0 is eliminated.
1696          * So, we defer the checks. */
1697         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1698              tcp_v4_checksum_init(skb) < 0))
1699                 goto bad_packet;
1700
1701         th = skb->h.th;
1702         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1703         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1704                                     skb->len - th->doff * 4);
1705         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1706         TCP_SKB_CB(skb)->when    = 0;
1707         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1708         TCP_SKB_CB(skb)->sacked  = 0;
1709
1710         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1711                              skb->nh.iph->daddr, ntohs(th->dest),
1712                              tcp_v4_iif(skb));
1713
1714         if (!sk)
1715                 goto no_tcp_socket;
1716
1717 process:
1718         if (sk->sk_state == TCP_TIME_WAIT)
1719                 goto do_time_wait;
1720
1721         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1722                 goto discard_and_relse;
1723
1724         if (sk_filter(sk, skb, 0))
1725                 goto discard_and_relse;
1726
1727         skb->dev = NULL;
1728
1729         bh_lock_sock(sk);
1730         ret = 0;
1731         if (!sock_owned_by_user(sk)) {
1732                 if (!tcp_prequeue(sk, skb))
1733                         ret = tcp_v4_do_rcv(sk, skb);
1734         } else
1735                 sk_add_backlog(sk, skb);
1736         bh_unlock_sock(sk);
1737
1738         sock_put(sk);
1739
1740         return ret;
1741
1742 no_tcp_socket:
1743         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1744                 goto discard_it;
1745
1746         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1747 bad_packet:
1748                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1749         } else {
1750                 tcp_v4_send_reset(skb);
1751         }
1752
1753 discard_it:
1754         /* Discard frame. */
1755         kfree_skb(skb);
1756         return 0;
1757
1758 discard_and_relse:
1759         sock_put(sk);
1760         goto discard_it;
1761
1762 do_time_wait:
1763         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1764                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1765                 goto discard_it;
1766         }
1767
1768         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1769                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1770                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1771                 goto discard_it;
1772         }
1773         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1774                                            skb, th, skb->len)) {
1775         case TCP_TW_SYN: {
1776                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1777                                                           ntohs(th->dest),
1778                                                           tcp_v4_iif(skb));
1779                 if (sk2) {
1780                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1781                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1782                         sk = sk2;
1783                         goto process;
1784                 }
1785                 /* Fall through to ACK */
1786         }
1787         case TCP_TW_ACK:
1788                 tcp_v4_timewait_ack(sk, skb);
1789                 break;
1790         case TCP_TW_RST:
1791                 goto no_tcp_socket;
1792         case TCP_TW_SUCCESS:;
1793         }
1794         goto discard_it;
1795 }
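/*
 * Editor's note: a worked example of the control-block setup in
 * tcp_v4_rcv() above.  end_seq advances by one unit for SYN and one for
 * FIN in addition to the payload bytes, because both flags occupy
 * sequence space.  For seq = 1000, 100 payload bytes, FIN set, SYN clear,
 * doff = 5 (a 20-byte header on a 120-byte segment):
 *
 *      end_seq = 1000 + 0 + 1 + (120 - 20) = 1101
 *
 * so the next in-order segment is expected to start at 1101.
 */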
1796
1797 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1798 {
1799         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1800         struct inet_sock *inet = inet_sk(sk);
1801
1802         sin->sin_family         = AF_INET;
1803         sin->sin_addr.s_addr    = inet->daddr;
1804         sin->sin_port           = inet->dport;
1805 }
1806
1807 /* VJ's idea. Save the last timestamp seen from this destination and hold
1808  * it for at least the normal timewait interval, to use for duplicate
1809  * segment detection in subsequent connections before they enter the
1810  * synchronized state.
1811  */
1812
1813 int tcp_v4_remember_stamp(struct sock *sk)
1814 {
1815         struct inet_sock *inet = inet_sk(sk);
1816         struct tcp_sock *tp = tcp_sk(sk);
1817         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1818         struct inet_peer *peer = NULL;
1819         int release_it = 0;
1820
1821         if (!rt || rt->rt_dst != inet->daddr) {
1822                 peer = inet_getpeer(inet->daddr, 1);
1823                 release_it = 1;
1824         } else {
1825                 if (!rt->peer)
1826                         rt_bind_peer(rt, 1);
1827                 peer = rt->peer;
1828         }
1829
1830         if (peer) {
1831                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1832                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1833                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1834                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1835                         peer->tcp_ts = tp->rx_opt.ts_recent;
1836                 }
1837                 if (release_it)
1838                         inet_putpeer(peer);
1839                 return 1;
1840         }
1841
1842         return 0;
1843 }
1844
1845 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1846 {
1847         struct inet_peer *peer = NULL;
1848
1849         peer = inet_getpeer(tw->tw_daddr, 1);
1850
1851         if (peer) {
1852                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1853                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1854                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1855                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1856                         peer->tcp_ts = tw->tw_ts_recent;
1857                 }
1858                 inet_putpeer(peer);
1859                 return 1;
1860         }
1861
1862         return 0;
1863 }
1864
1865 struct tcp_func ipv4_specific = {
1866         .queue_xmit     =       ip_queue_xmit,
1867         .send_check     =       tcp_v4_send_check,
1868         .rebuild_header =       inet_sk_rebuild_header,
1869         .conn_request   =       tcp_v4_conn_request,
1870         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1871         .remember_stamp =       tcp_v4_remember_stamp,
1872         .net_header_len =       sizeof(struct iphdr),
1873         .setsockopt     =       ip_setsockopt,
1874         .getsockopt     =       ip_getsockopt,
1875         .addr2sockaddr  =       v4_addr2sockaddr,
1876         .sockaddr_len   =       sizeof(struct sockaddr_in),
1877 };
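/*
 * Editor's note: the address-family-independent TCP code reaches the IPv4
 * routines above only through this ops table, via tp->af_specific.  A
 * hedged sketch of the dispatch pattern as it appears on the transmit path
 * (simplified; the real call sites are in tcp_output.c):
 */
#if 0   /* illustration only */
static int xmit_via_af_ops_sketch(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* For an IPv4 socket these resolve to tcp_v4_send_check() and
         * ip_queue_xmit(); an IPv6 socket points at ipv6_specific instead. */
        tp->af_specific->send_check(sk, skb->h.th, skb->len, skb);
        return tp->af_specific->queue_xmit(skb, 0);
}
#endif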
1878
1879 /* NOTE: A lot of things are set to zero explicitly by the call to
1880  *       sk_alloc(), so they need not be done here.
1881  */
1882 static int tcp_v4_init_sock(struct sock *sk)
1883 {
1884         struct tcp_sock *tp = tcp_sk(sk);
1885
1886         skb_queue_head_init(&tp->out_of_order_queue);
1887         tcp_init_xmit_timers(sk);
1888         tcp_prequeue_init(tp);
1889
1890         tp->rto  = TCP_TIMEOUT_INIT;
1891         tp->mdev = TCP_TIMEOUT_INIT;
1892
1893         /* So many TCP implementations out there (incorrectly) count the
1894          * initial SYN frame in their delayed-ACK and congestion control
1895          * algorithms that we must have the following bandaid to talk
1896          * efficiently to them.  -DaveM
1897          */
1898         tp->snd_cwnd = 2;
1899
1900         /* See draft-stevens-tcpca-spec-01 for discussion of the
1901          * initialization of these values.
1902          */
1903         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1904         tp->snd_cwnd_clamp = ~0;
1905         tp->mss_cache = 536;
1906
1907         tp->reordering = sysctl_tcp_reordering;
1908         tp->ca_ops = &tcp_init_congestion_ops;
1909
1910         sk->sk_state = TCP_CLOSE;
1911
1912         sk->sk_write_space = sk_stream_write_space;
1913         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1914
1915         tp->af_specific = &ipv4_specific;
1916
1917         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1918         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1919
1920         atomic_inc(&tcp_sockets_allocated);
1921
1922         return 0;
1923 }
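/*
 * Editor's note: a user-space illustration (not kernel code) of when the
 * defaults above take effect.  tcp_v4_init_sock() is wired up as
 * tcp_prot.init below, so it runs as part of socket() for every new IPv4
 * TCP socket, before connect() or listen() is ever called.
 */
#if 0   /* illustration only */
#include <sys/socket.h>
#include <netinet/in.h>

int make_tcp_socket(void)
{
        /* This call ends up in tcp_prot.init == tcp_v4_init_sock(), which
         * seeds snd_cwnd, snd_ssthresh, rto, the buffer sizes, etc. */
        return socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
}
#endif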
1924
1925 int tcp_v4_destroy_sock(struct sock *sk)
1926 {
1927         struct tcp_sock *tp = tcp_sk(sk);
1928
1929         tcp_clear_xmit_timers(sk);
1930
1931         tcp_cleanup_congestion_control(tp);
1932
1933         /* Clean up the write buffer. */
1934         sk_stream_writequeue_purge(sk);
1935
1936         /* Clean up our (hopefully empty) out_of_order_queue. */
1937         __skb_queue_purge(&tp->out_of_order_queue);
1938
1939         /* Clean up the prequeue; it really should be empty. */
1940         __skb_queue_purge(&tp->ucopy.prequeue);
1941
1942         /* Clean up a referenced TCP bind bucket. */
1943         if (tp->bind_hash)
1944                 tcp_put_port(sk);
1945
1946         /*
1947          * If a sendmsg cached page exists, toss it.
1948          */
1949         if (sk->sk_sndmsg_page) {
1950                 __free_page(sk->sk_sndmsg_page);
1951                 sk->sk_sndmsg_page = NULL;
1952         }
1953
1954         atomic_dec(&tcp_sockets_allocated);
1955
1956         return 0;
1957 }
1958
1959 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1960
1961 #ifdef CONFIG_PROC_FS
1962 /* Proc filesystem TCP sock list dumping. */
1963
1964 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1965 {
1966         return hlist_empty(head) ? NULL :
1967                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1968 }
1969
1970 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1971 {
1972         return tw->tw_node.next ?
1973                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1974 }
1975
1976 static void *listening_get_next(struct seq_file *seq, void *cur)
1977 {
1978         struct tcp_sock *tp;
1979         struct hlist_node *node;
1980         struct sock *sk = cur;
1981         struct tcp_iter_state* st = seq->private;
1982
1983         if (!sk) {
1984                 st->bucket = 0;
1985                 sk = sk_head(&tcp_listening_hash[0]);
1986                 goto get_sk;
1987         }
1988
1989         ++st->num;
1990
1991         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1992                 struct request_sock *req = cur;
1993
1994                 tp = tcp_sk(st->syn_wait_sk);
1995                 req = req->dl_next;
1996                 while (1) {
1997                         while (req) {
1998                                 if (req->rsk_ops->family == st->family) {
1999                                         cur = req;
2000                                         goto out;
2001                                 }
2002                                 req = req->dl_next;
2003                         }
2004                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2005                                 break;
2006 get_req:
2007                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2008                 }
2009                 sk        = sk_next(st->syn_wait_sk);
2010                 st->state = TCP_SEQ_STATE_LISTENING;
2011                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2012         } else {
2013                 tp = tcp_sk(sk);
2014                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2015                 if (reqsk_queue_len(&tp->accept_queue))
2016                         goto start_req;
2017                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2018                 sk = sk_next(sk);
2019         }
2020 get_sk:
2021         sk_for_each_from(sk, node) {
2022                 if (sk->sk_family == st->family) {
2023                         cur = sk;
2024                         goto out;
2025                 }
2026                 tp = tcp_sk(sk);
2027                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2028                 if (reqsk_queue_len(&tp->accept_queue)) {
2029 start_req:
2030                         st->uid         = sock_i_uid(sk);
2031                         st->syn_wait_sk = sk;
2032                         st->state       = TCP_SEQ_STATE_OPENREQ;
2033                         st->sbucket     = 0;
2034                         goto get_req;
2035                 }
2036                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2037         }
2038         if (++st->bucket < INET_LHTABLE_SIZE) {
2039                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2040                 goto get_sk;
2041         }
2042         cur = NULL;
2043 out:
2044         return cur;
2045 }
2046
2047 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2048 {
2049         void *rc = listening_get_next(seq, NULL);
2050
2051         while (rc && *pos) {
2052                 rc = listening_get_next(seq, rc);
2053                 --*pos;
2054         }
2055         return rc;
2056 }
2057
2058 static void *established_get_first(struct seq_file *seq)
2059 {
2060         struct tcp_iter_state* st = seq->private;
2061         void *rc = NULL;
2062
2063         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2064                 struct sock *sk;
2065                 struct hlist_node *node;
2066                 struct tcp_tw_bucket *tw;
2067
2068                 /* We can reschedule _before_ having picked the target: */
2069                 cond_resched_softirq();
2070
2071                 read_lock(&tcp_ehash[st->bucket].lock);
2072                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2073                         if (sk->sk_family != st->family) {
2074                                 continue;
2075                         }
2076                         rc = sk;
2077                         goto out;
2078                 }
2079                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2080                 tw_for_each(tw, node,
2081                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2082                         if (tw->tw_family != st->family) {
2083                                 continue;
2084                         }
2085                         rc = tw;
2086                         goto out;
2087                 }
2088                 read_unlock(&tcp_ehash[st->bucket].lock);
2089                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2090         }
2091 out:
2092         return rc;
2093 }
2094
2095 static void *established_get_next(struct seq_file *seq, void *cur)
2096 {
2097         struct sock *sk = cur;
2098         struct tcp_tw_bucket *tw;
2099         struct hlist_node *node;
2100         struct tcp_iter_state* st = seq->private;
2101
2102         ++st->num;
2103
2104         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2105                 tw = cur;
2106                 tw = tw_next(tw);
2107 get_tw:
2108                 while (tw && tw->tw_family != st->family) {
2109                         tw = tw_next(tw);
2110                 }
2111                 if (tw) {
2112                         cur = tw;
2113                         goto out;
2114                 }
2115                 read_unlock(&tcp_ehash[st->bucket].lock);
2116                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2117
2118                 /* We can reschedule between buckets: */
2119                 cond_resched_softirq();
2120
2121                 if (++st->bucket < tcp_ehash_size) {
2122                         read_lock(&tcp_ehash[st->bucket].lock);
2123                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2124                 } else {
2125                         cur = NULL;
2126                         goto out;
2127                 }
2128         } else
2129                 sk = sk_next(sk);
2130
2131         sk_for_each_from(sk, node) {
2132                 if (sk->sk_family == st->family)
2133                         goto found;
2134         }
2135
2136         st->state = TCP_SEQ_STATE_TIME_WAIT;
2137         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2138         goto get_tw;
2139 found:
2140         cur = sk;
2141 out:
2142         return cur;
2143 }
2144
2145 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2146 {
2147         void *rc = established_get_first(seq);
2148
2149         while (rc && pos) {
2150                 rc = established_get_next(seq, rc);
2151                 --pos;
2152         }               
2153         return rc;
2154 }
2155
2156 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2157 {
2158         void *rc;
2159         struct tcp_iter_state* st = seq->private;
2160
2161         tcp_listen_lock();
2162         st->state = TCP_SEQ_STATE_LISTENING;
2163         rc        = listening_get_idx(seq, &pos);
2164
2165         if (!rc) {
2166                 tcp_listen_unlock();
2167                 local_bh_disable();
2168                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2169                 rc        = established_get_idx(seq, pos);
2170         }
2171
2172         return rc;
2173 }
2174
2175 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2176 {
2177         struct tcp_iter_state* st = seq->private;
2178         st->state = TCP_SEQ_STATE_LISTENING;
2179         st->num = 0;
2180         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2181 }
2182
2183 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2184 {
2185         void *rc = NULL;
2186         struct tcp_iter_state* st;
2187
2188         if (v == SEQ_START_TOKEN) {
2189                 rc = tcp_get_idx(seq, 0);
2190                 goto out;
2191         }
2192         st = seq->private;
2193
2194         switch (st->state) {
2195         case TCP_SEQ_STATE_OPENREQ:
2196         case TCP_SEQ_STATE_LISTENING:
2197                 rc = listening_get_next(seq, v);
2198                 if (!rc) {
2199                         tcp_listen_unlock();
2200                         local_bh_disable();
2201                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2202                         rc        = established_get_first(seq);
2203                 }
2204                 break;
2205         case TCP_SEQ_STATE_ESTABLISHED:
2206         case TCP_SEQ_STATE_TIME_WAIT:
2207                 rc = established_get_next(seq, v);
2208                 break;
2209         }
2210 out:
2211         ++*pos;
2212         return rc;
2213 }
2214
2215 static void tcp_seq_stop(struct seq_file *seq, void *v)
2216 {
2217         struct tcp_iter_state* st = seq->private;
2218
2219         switch (st->state) {
2220         case TCP_SEQ_STATE_OPENREQ:
2221                 if (v) {
2222                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2223                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2224                 }
2225         case TCP_SEQ_STATE_LISTENING:
2226                 if (v != SEQ_START_TOKEN)
2227                         tcp_listen_unlock();
2228                 break;
2229         case TCP_SEQ_STATE_TIME_WAIT:
2230         case TCP_SEQ_STATE_ESTABLISHED:
2231                 if (v)
2232                         read_unlock(&tcp_ehash[st->bucket].lock);
2233                 local_bh_enable();
2234                 break;
2235         }
2236 }
2237
2238 static int tcp_seq_open(struct inode *inode, struct file *file)
2239 {
2240         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2241         struct seq_file *seq;
2242         struct tcp_iter_state *s;
2243         int rc;
2244
2245         if (unlikely(afinfo == NULL))
2246                 return -EINVAL;
2247
2248         s = kmalloc(sizeof(*s), GFP_KERNEL);
2249         if (!s)
2250                 return -ENOMEM;
2251         memset(s, 0, sizeof(*s));
2252         s->family               = afinfo->family;
2253         s->seq_ops.start        = tcp_seq_start;
2254         s->seq_ops.next         = tcp_seq_next;
2255         s->seq_ops.show         = afinfo->seq_show;
2256         s->seq_ops.stop         = tcp_seq_stop;
2257
2258         rc = seq_open(file, &s->seq_ops);
2259         if (rc)
2260                 goto out_kfree;
2261         seq          = file->private_data;
2262         seq->private = s;
2263 out:
2264         return rc;
2265 out_kfree:
2266         kfree(s);
2267         goto out;
2268 }
2269
2270 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2271 {
2272         int rc = 0;
2273         struct proc_dir_entry *p;
2274
2275         if (!afinfo)
2276                 return -EINVAL;
2277         afinfo->seq_fops->owner         = afinfo->owner;
2278         afinfo->seq_fops->open          = tcp_seq_open;
2279         afinfo->seq_fops->read          = seq_read;
2280         afinfo->seq_fops->llseek        = seq_lseek;
2281         afinfo->seq_fops->release       = seq_release_private;
2282         
2283         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2284         if (p)
2285                 p->data = afinfo;
2286         else
2287                 rc = -ENOMEM;
2288         return rc;
2289 }
2290
2291 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2292 {
2293         if (!afinfo)
2294                 return;
2295         proc_net_remove(afinfo->name);
2296         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2297 }
2298
2299 static void get_openreq4(struct sock *sk, struct request_sock *req,
2300                          char *tmpbuf, int i, int uid)
2301 {
2302         const struct inet_request_sock *ireq = inet_rsk(req);
2303         int ttd = req->expires - jiffies;
2304
2305         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2306                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2307                 i,
2308                 ireq->loc_addr,
2309                 ntohs(inet_sk(sk)->sport),
2310                 ireq->rmt_addr,
2311                 ntohs(ireq->rmt_port),
2312                 TCP_SYN_RECV,
2313                 0, 0, /* could print option size, but that is af dependent. */
2314                 1,    /* timers active (only the expire timer) */
2315                 jiffies_to_clock_t(ttd),
2316                 req->retrans,
2317                 uid,
2318                 0,  /* non standard timer */
2319                 0, /* open_requests have no inode */
2320                 atomic_read(&sk->sk_refcnt),
2321                 req);
2322 }
2323
2324 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2325 {
2326         int timer_active;
2327         unsigned long timer_expires;
2328         struct tcp_sock *tp = tcp_sk(sp);
2329         struct inet_sock *inet = inet_sk(sp);
2330         unsigned int dest = inet->daddr;
2331         unsigned int src = inet->rcv_saddr;
2332         __u16 destp = ntohs(inet->dport);
2333         __u16 srcp = ntohs(inet->sport);
2334
2335         if (tp->pending == TCP_TIME_RETRANS) {
2336                 timer_active    = 1;
2337                 timer_expires   = tp->timeout;
2338         } else if (tp->pending == TCP_TIME_PROBE0) {
2339                 timer_active    = 4;
2340                 timer_expires   = tp->timeout;
2341         } else if (timer_pending(&sp->sk_timer)) {
2342                 timer_active    = 2;
2343                 timer_expires   = sp->sk_timer.expires;
2344         } else {
2345                 timer_active    = 0;
2346                 timer_expires = jiffies;
2347         }
2348
2349         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2350                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2351                 i, src, srcp, dest, destp, sp->sk_state,
2352                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2353                 timer_active,
2354                 jiffies_to_clock_t(timer_expires - jiffies),
2355                 tp->retransmits,
2356                 sock_i_uid(sp),
2357                 tp->probes_out,
2358                 sock_i_ino(sp),
2359                 atomic_read(&sp->sk_refcnt), sp,
2360                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2361                 tp->snd_cwnd,
2362                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2363 }
2364
2365 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2366 {
2367         unsigned int dest, src;
2368         __u16 destp, srcp;
2369         int ttd = tw->tw_ttd - jiffies;
2370
2371         if (ttd < 0)
2372                 ttd = 0;
2373
2374         dest  = tw->tw_daddr;
2375         src   = tw->tw_rcv_saddr;
2376         destp = ntohs(tw->tw_dport);
2377         srcp  = ntohs(tw->tw_sport);
2378
2379         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2380                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2381                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2382                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2383                 atomic_read(&tw->tw_refcnt), tw);
2384 }
2385
2386 #define TMPSZ 150
2387
2388 static int tcp4_seq_show(struct seq_file *seq, void *v)
2389 {
2390         struct tcp_iter_state* st;
2391         char tmpbuf[TMPSZ + 1];
2392
2393         if (v == SEQ_START_TOKEN) {
2394                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2395                            "  sl  local_address rem_address   st tx_queue "
2396                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2397                            "inode");
2398                 goto out;
2399         }
2400         st = seq->private;
2401
2402         switch (st->state) {
2403         case TCP_SEQ_STATE_LISTENING:
2404         case TCP_SEQ_STATE_ESTABLISHED:
2405                 get_tcp4_sock(v, tmpbuf, st->num);
2406                 break;
2407         case TCP_SEQ_STATE_OPENREQ:
2408                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2409                 break;
2410         case TCP_SEQ_STATE_TIME_WAIT:
2411                 get_timewait4_sock(v, tmpbuf, st->num);
2412                 break;
2413         }
2414         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2415 out:
2416         return 0;
2417 }
2418
2419 static struct file_operations tcp4_seq_fops;
2420 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2421         .owner          = THIS_MODULE,
2422         .name           = "tcp",
2423         .family         = AF_INET,
2424         .seq_show       = tcp4_seq_show,
2425         .seq_fops       = &tcp4_seq_fops,
2426 };
2427
2428 int __init tcp4_proc_init(void)
2429 {
2430         return tcp_proc_register(&tcp4_seq_afinfo);
2431 }
2432
2433 void tcp4_proc_exit(void)
2434 {
2435         tcp_proc_unregister(&tcp4_seq_afinfo);
2436 }
2437 #endif /* CONFIG_PROC_FS */
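/*
 * Editor's note: a small user-space sketch (not kernel code) showing what
 * the seq_file machinery above produces: /proc/net/tcp, one header line
 * followed by one line per listening, established, open-request and
 * time-wait entry, formatted by get_tcp4_sock(), get_openreq4() and
 * get_timewait4_sock().
 */
#if 0   /* illustration only */
#include <stdio.h>

int dump_proc_net_tcp(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return -1;
        /* First line is the header: "sl local_address rem_address st ..." */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
#endif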
2438
2439 struct proto tcp_prot = {
2440         .name                   = "TCP",
2441         .owner                  = THIS_MODULE,
2442         .close                  = tcp_close,
2443         .connect                = tcp_v4_connect,
2444         .disconnect             = tcp_disconnect,
2445         .accept                 = tcp_accept,
2446         .ioctl                  = tcp_ioctl,
2447         .init                   = tcp_v4_init_sock,
2448         .destroy                = tcp_v4_destroy_sock,
2449         .shutdown               = tcp_shutdown,
2450         .setsockopt             = tcp_setsockopt,
2451         .getsockopt             = tcp_getsockopt,
2452         .sendmsg                = tcp_sendmsg,
2453         .recvmsg                = tcp_recvmsg,
2454         .backlog_rcv            = tcp_v4_do_rcv,
2455         .hash                   = tcp_v4_hash,
2456         .unhash                 = tcp_unhash,
2457         .get_port               = tcp_v4_get_port,
2458         .enter_memory_pressure  = tcp_enter_memory_pressure,
2459         .sockets_allocated      = &tcp_sockets_allocated,
2460         .memory_allocated       = &tcp_memory_allocated,
2461         .memory_pressure        = &tcp_memory_pressure,
2462         .sysctl_mem             = sysctl_tcp_mem,
2463         .sysctl_wmem            = sysctl_tcp_wmem,
2464         .sysctl_rmem            = sysctl_tcp_rmem,
2465         .max_header             = MAX_TCP_HEADER,
2466         .obj_size               = sizeof(struct tcp_sock),
2467         .rsk_prot               = &tcp_request_sock_ops,
2468 };
2469
2470
2471
2472 void __init tcp_v4_init(struct net_proto_family *ops)
2473 {
2474         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2475         if (err < 0)
2476                 panic("Failed to create the TCP control socket.\n");
2477         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2478         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2479
2480         /* Unhash it so that IP input processing does not even
2481          * see it; we do not wish this socket to receive incoming
2482          * packets.
2483          */
2484         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2485 }
2486
2487 EXPORT_SYMBOL(ipv4_specific);
2488 EXPORT_SYMBOL(tcp_bind_hash);
2489 EXPORT_SYMBOL(inet_bind_bucket_create);
2490 EXPORT_SYMBOL(tcp_hashinfo);
2491 EXPORT_SYMBOL(tcp_inherit_port);
2492 EXPORT_SYMBOL(tcp_listen_wlock);
2493 EXPORT_SYMBOL(tcp_port_rover);
2494 EXPORT_SYMBOL(tcp_prot);
2495 EXPORT_SYMBOL(tcp_put_port);
2496 EXPORT_SYMBOL(tcp_unhash);
2497 EXPORT_SYMBOL(tcp_v4_conn_request);
2498 EXPORT_SYMBOL(tcp_v4_connect);
2499 EXPORT_SYMBOL(tcp_v4_do_rcv);
2500 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2501 EXPORT_SYMBOL(tcp_v4_send_check);
2502 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2503
2504 #ifdef CONFIG_PROC_FS
2505 EXPORT_SYMBOL(tcp_proc_register);
2506 EXPORT_SYMBOL(tcp_proc_unregister);
2507 #endif
2508 EXPORT_SYMBOL(sysctl_local_port_range);
2509 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2510 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2511