net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9  *
  10  *              IPv4 specific functions
  11  *
  12  *
  13  *              code split from:
  14  *              linux/ipv4/tcp.c
  15  *              linux/ipv4/tcp_input.c
  16  *              linux/ipv4/tcp_output.c
  17  *
  18  *              See tcp.c for author information
  19  *
  20  *      This program is free software; you can redistribute it and/or
  21  *      modify it under the terms of the GNU General Public License
  22  *      as published by the Free Software Foundation; either version
  23  *      2 of the License, or (at your option) any later version.
  24  */
  25
  26 /*
  27  * Changes:
  28  *              David S. Miller :       New socket lookup architecture.
  29  *                                      This code is dedicated to John Dyson.
  30  *              David S. Miller :       Change semantics of established hash,
  31  *                                      half is devoted to TIME_WAIT sockets
  32  *                                      and the rest go in the other half.
  33  *              Andi Kleen :            Add support for syncookies and fixed
  34  *                                      some bugs: ip options weren't passed to
  35  *                                      the TCP layer, missed a check for an
  36  *                                      ACK bit.
  37  *              Andi Kleen :            Implemented fast path mtu discovery.
  38  *                                      Fixed many serious bugs in the
  39  *                                      request_sock handling and moved
  40  *                                      most of it into the af independent code.
  41  *                                      Added tail drop and some other bugfixes.
  42  *                                      Added new listen semantics.
  43  *              Mike McLagan    :       Routing by source
  44  *      Juan Jose Ciarlante:            ip_dynaddr bits
  45  *              Andi Kleen:             various fixes.
  46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  47  *                                      coma.
  48  *      Andi Kleen              :       Fix new listen.
  49  *      Andi Kleen              :       Fix accept error reporting.
  50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  52  *                                      a single port at the same time.
  53  */
  54
  55
  56 #include <linux/types.h>
  57 #include <linux/fcntl.h>
  58 #include <linux/module.h>
  59 #include <linux/random.h>
  60 #include <linux/cache.h>
  61 #include <linux/jhash.h>
  62 #include <linux/init.h>
  63 #include <linux/times.h>
  64
  65 #include <net/net_namespace.h>
  66 #include <net/icmp.h>
  67 #include <net/inet_hashtables.h>
  68 #include <net/tcp.h>
  69 #include <net/transp_v6.h>
  70 #include <net/ipv6.h>
  71 #include <net/inet_common.h>
  72 #include <net/timewait_sock.h>
  73 #include <net/xfrm.h>
  74 #include <net/netdma.h>
  75
  76 #include <linux/inet.h>
  77 #include <linux/ipv6.h>
  78 #include <linux/stddef.h>
  79 #include <linux/proc_fs.h>
  80 #include <linux/seq_file.h>
  81
  82 #include <linux/crypto.h>
  83 #include <linux/scatterlist.h>
  84
  85 int sysctl_tcp_tw_reuse __read_mostly;
  86 int sysctl_tcp_low_latency __read_mostly;
  87
  88 /* Check TCP sequence numbers in ICMP packets. */
  89 #define ICMP_MIN_LENGTH 8
  90
  91 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
  92
  93 #ifdef CONFIG_TCP_MD5SIG
  94 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  95                                                    __be32 addr);
  96 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
  97                                    __be32 saddr, __be32 daddr,
  98                                    struct tcphdr *th, int protocol,
  99                                    unsigned int tcplen);
 100 #endif
 101
 102 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 103         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
 104         .lhash_users = ATOMIC_INIT(0),
 105         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
 106 };
 107
 108 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 109 {
 110         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 111                                           ip_hdr(skb)->saddr,
 112                                           tcp_hdr(skb)->dest,
 113                                           tcp_hdr(skb)->source);
 114 }
 115
 116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 117 {
 118         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 119         struct tcp_sock *tp = tcp_sk(sk);
 120
 121         /* With PAWS, it is safe from the viewpoint
 122            of data integrity. Even without PAWS it is safe provided sequence
 123            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 124
 125            Actually, the idea is close to VJ's one, only timestamp cache is
 126            held not per host, but per port pair and TW bucket is used as state
 127            holder.
 128
 129            If TW bucket has been already destroyed we fall back to VJ's scheme
 130            and use initial timestamp retrieved from peer table.
 131          */
 132         if (tcptw->tw_ts_recent_stamp &&
 133             (twp == NULL || (sysctl_tcp_tw_reuse &&
 134                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 135                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 136                 if (tp->write_seq == 0)
 137                         tp->write_seq = 1;
 138                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 139                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 140                 sock_hold(sktw);
 141                 return 1;
 142         }
 143
 144         return 0;
 145 }
 146
 147 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 148
 149 /* This will initiate an outgoing connection. */
 150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 151 {
 152         struct inet_sock *inet = inet_sk(sk);
 153         struct tcp_sock *tp = tcp_sk(sk);
 154         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 155         struct rtable *rt;
 156         __be32 daddr, nexthop;
 157         int tmp;
 158         int err;
 159
 160         if (addr_len < sizeof(struct sockaddr_in))
 161                 return -EINVAL;
 162
 163         if (usin->sin_family != AF_INET)
 164                 return -EAFNOSUPPORT;
 165
 166         nexthop = daddr = usin->sin_addr.s_addr;
 167         if (inet->opt && inet->opt->srr) {
 168                 if (!daddr)
 169                         return -EINVAL;
 170                 nexthop = inet->opt->faddr;
 171         }
 172
 173         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
 174                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 175                                IPPROTO_TCP,
 176                                inet->sport, usin->sin_port, sk, 1);
 177         if (tmp < 0) {
 178                 if (tmp == -ENETUNREACH)
 179                         IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
 180                 return tmp;
 181         }
 182
 183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 184                 ip_rt_put(rt);
 185                 return -ENETUNREACH;
 186         }
 187
 188         if (!inet->opt || !inet->opt->srr)
 189                 daddr = rt->rt_dst;
 190
 191         if (!inet->saddr)
 192                 inet->saddr = rt->rt_src;
 193         inet->rcv_saddr = inet->saddr;
 194
 195         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
 196                 /* Reset inherited state */
 197                 tp->rx_opt.ts_recent       = 0;
 198                 tp->rx_opt.ts_recent_stamp = 0;
 199                 tp->write_seq              = 0;
 200         }
 201
 202         if (tcp_death_row.sysctl_tw_recycle &&
 203             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 204                 struct inet_peer *peer = rt_get_peer(rt);
 205                 /*
 206                  * VJ's idea. We save last timestamp seen from
 207                  * the destination in peer table, when entering state
 208                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 209                  * when trying new connection.
 210                  */
 211                 if (peer != NULL &&
 212                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
 213                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 214                         tp->rx_opt.ts_recent = peer->tcp_ts;
 215                 }
 216         }
 217
 218         inet->dport = usin->sin_port;
 219         inet->daddr = daddr;
 220
 221         inet_csk(sk)->icsk_ext_hdr_len = 0;
 222         if (inet->opt)
 223                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 224
 225         tp->rx_opt.mss_clamp = 536;
 226
 227         /* Socket identity is still unknown (sport may be zero).
 228          * However we set state to SYN-SENT and not releasing socket
 229          * lock select source port, enter ourselves into the hash tables and
 230          * complete initialization after this.
 231          */
 232         tcp_set_state(sk, TCP_SYN_SENT);
 233         err = inet_hash_connect(&tcp_death_row, sk);
 234         if (err)
 235                 goto failure;
 236
 237         err = ip_route_newports(&rt, IPPROTO_TCP,
 238                                 inet->sport, inet->dport, sk);
 239         if (err)
 240                 goto failure;
 241
 242         /* OK, now commit destination to socket.  */
 243         sk->sk_gso_type = SKB_GSO_TCPV4;
 244         sk_setup_caps(sk, &rt->u.dst);
 245
 246         if (!tp->write_seq)
 247                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
 248                                                            inet->daddr,
 249                                                            inet->sport,
 250                                                            usin->sin_port);
 251
 252         inet->id = tp->write_seq ^ jiffies;
 253
 254         err = tcp_connect(sk);
 255         rt = NULL;
 256         if (err)
 257                 goto failure;
 258
 259         return 0;
 260
 261 failure:
 262         /*
 263          * This unhashes the socket and releases the local port,
 264          * if necessary.
 265          */
 266         tcp_set_state(sk, TCP_CLOSE);
 267         ip_rt_put(rt);
 268         sk->sk_route_caps = 0;
 269         inet->dport = 0;
 270         return err;
 271 }
 272
 273 /*
 274  * This routine does path mtu discovery as defined in RFC1191.
 275  */
 276 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 277 {
 278         struct dst_entry *dst;
 279         struct inet_sock *inet = inet_sk(sk);
 280
 281         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 282          * send out by Linux are always <576bytes so they should go through
 283          * unfragmented).
 284          */
 285         if (sk->sk_state == TCP_LISTEN)
 286                 return;
 287
 288         /* We don't check in the destentry if pmtu discovery is forbidden
 289          * on this route. We just assume that no packet_to_big packets
 290          * are send back when pmtu discovery is not active.
 291          * There is a small race when the user changes this flag in the
 292          * route, but I think that's acceptable.
 293          */
 294         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 295                 return;
 296
 297         dst->ops->update_pmtu(dst, mtu);
 298
 299         /* Something is about to be wrong... Remember soft error
 300          * for the case, if this connection will not able to recover.
 301          */
 302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 303                 sk->sk_err_soft = EMSGSIZE;
 304
 305         mtu = dst_mtu(dst);
 306
 307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 309                 tcp_sync_mss(sk, mtu);
 310
 311                 /* Resend the TCP packet because it's
 312                  * clear that the old packet has been
 313                  * dropped. This is the new "fast" path mtu
 314                  * discovery.
 315                  */
 316                 tcp_simple_retransmit(sk);
 317         } /* else let the usual retransmit timer handle it */
 318 }
 319
 320 /*
 321  * This routine is called by the ICMP module when it gets some
 322  * sort of error condition.  If err < 0 then the socket should
 323  * be closed and the error returned to the user.  If err > 0
 324  * it's just the icmp type << 8 | icmp code.  After adjustment
 325  * header points to the first 8 bytes of the tcp header.  We need
 326  * to find the appropriate port.
 327  *
 328  * The locking strategy used here is very "optimistic". When
 329  * someone else accesses the socket the ICMP is just dropped
 330  * and for some paths there is no check at all.
 331  * A more general error queue to queue errors for later handling
 332  * is probably better.
 333  *
 334  */
 335
 336 void tcp_v4_err(struct sk_buff *skb, u32 info)
 337 {
 338         struct iphdr *iph = (struct iphdr *)skb->data;
 339         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 340         struct tcp_sock *tp;
 341         struct inet_sock *inet;
 342         const int type = icmp_hdr(skb)->type;
 343         const int code = icmp_hdr(skb)->code;
 344         struct sock *sk;
 345         __u32 seq;
 346         int err;
 347
 348         if (skb->len < (iph->ihl << 2) + 8) {
 349                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 350                 return;
 351         }
 352
 353         sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest,
 354                         iph->saddr, th->source, inet_iif(skb));
 355         if (!sk) {
 356                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 357                 return;
 358         }
 359         if (sk->sk_state == TCP_TIME_WAIT) {
 360                 inet_twsk_put(inet_twsk(sk));
 361                 return;
 362         }
 363
 364         bh_lock_sock(sk);
 365         /* If too many ICMPs get dropped on busy
 366          * servers this needs to be solved differently.
 367          */
 368         if (sock_owned_by_user(sk))
 369                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
 370
 371         if (sk->sk_state == TCP_CLOSE)
 372                 goto out;
 373
 374         tp = tcp_sk(sk);
 375         seq = ntohl(th->seq);
 376         if (sk->sk_state != TCP_LISTEN &&
 377             !between(seq, tp->snd_una, tp->snd_nxt)) {
 378                 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 379                 goto out;
 380         }
 381
 382         switch (type) {
 383         case ICMP_SOURCE_QUENCH:
 384                 /* Just silently ignore these. */
 385                 goto out;
 386         case ICMP_PARAMETERPROB:
 387                 err = EPROTO;
 388                 break;
 389         case ICMP_DEST_UNREACH:
 390                 if (code > NR_ICMP_UNREACH)
 391                         goto out;
 392
 393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 394                         if (!sock_owned_by_user(sk))
 395                                 do_pmtu_discovery(sk, iph, info);
 396                         goto out;
 397                 }
 398
 399                 err = icmp_err_convert[code].errno;
 400                 break;
 401         case ICMP_TIME_EXCEEDED:
 402                 err = EHOSTUNREACH;
 403                 break;
 404         default:
 405                 goto out;
 406         }
 407
 408         switch (sk->sk_state) {
 409                 struct request_sock *req, **prev;
 410         case TCP_LISTEN:
 411                 if (sock_owned_by_user(sk))
 412                         goto out;
 413
 414                 req = inet_csk_search_req(sk, &prev, th->dest,
 415                                           iph->daddr, iph->saddr);
 416                 if (!req)
 417                         goto out;
 418
 419                 /* ICMPs are not backlogged, hence we cannot get
 420                    an established socket here.
 421                  */
 422                 BUG_TRAP(!req->sk);
 423
 424                 if (seq != tcp_rsk(req)->snt_isn) {
 425                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 426                         goto out;
 427                 }
 428
 429                 /*
 430                  * Still in SYN_RECV, just remove it silently.
 431                  * There is no good way to pass the error to the newly
 432                  * created socket, and POSIX does not want network
 433                  * errors returned from accept().
 434                  */
 435                 inet_csk_reqsk_queue_drop(sk, req, prev);
 436                 goto out;
 437
 438         case TCP_SYN_SENT:
 439         case TCP_SYN_RECV:  /* Cannot happen.
 440                                It can f.e. if SYNs crossed.
 441                              */
 442                 if (!sock_owned_by_user(sk)) {
 443                         sk->sk_err = err;
 444
 445                         sk->sk_error_report(sk);
 446
 447                         tcp_done(sk);
 448                 } else {
 449                         sk->sk_err_soft = err;
 450                 }
 451                 goto out;
 452         }
 453
 454         /* If we've already connected we will keep trying
 455          * until we time out, or the user gives up.
 456          *
 457          * rfc1122 4.2.3.9 allows to consider as hard errors
 458          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 459          * but it is obsoleted by pmtu discovery).
 460          *
 461          * Note, that in modern internet, where routing is unreliable
 462          * and in each dark corner broken firewalls sit, sending random
 463          * errors ordered by their masters even this two messages finally lose
 464          * their original sense (even Linux sends invalid PORT_UNREACHs)
 465          *
 466          * Now we are in compliance with RFCs.
 467          *                                                      --ANK (980905)
 468          */
 469
 470         inet = inet_sk(sk);
 471         if (!sock_owned_by_user(sk) && inet->recverr) {
 472                 sk->sk_err = err;
 473                 sk->sk_error_report(sk);
 474         } else  { /* Only an error on timeout */
 475                 sk->sk_err_soft = err;
 476         }
 477
 478 out:
 479         bh_unlock_sock(sk);
 480         sock_put(sk);
 481 }
 482
 483 /* This routine computes an IPv4 TCP checksum. */
 484 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 485 {
 486         struct inet_sock *inet = inet_sk(sk);
 487         struct tcphdr *th = tcp_hdr(skb);
 488
 489         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 490                 th->check = ~tcp_v4_check(len, inet->saddr,
 491                                           inet->daddr, 0);
 492                 skb->csum_start = skb_transport_header(skb) - skb->head;
 493                 skb->csum_offset = offsetof(struct tcphdr, check);
 494         } else {
 495                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 496                                          csum_partial((char *)th,
 497                                                       th->doff << 2,
 498                                                       skb->csum));
 499         }
 500 }
 501
 502 int tcp_v4_gso_send_check(struct sk_buff *skb)
 503 {
 504         const struct iphdr *iph;
 505         struct tcphdr *th;
 506
 507         if (!pskb_may_pull(skb, sizeof(*th)))
 508                 return -EINVAL;
 509
 510         iph = ip_hdr(skb);
 511         th = tcp_hdr(skb);
 512
 513         th->check = 0;
 514         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 515         skb->csum_start = skb_transport_header(skb) - skb->head;
 516         skb->csum_offset = offsetof(struct tcphdr, check);
 517         skb->ip_summed = CHECKSUM_PARTIAL;
 518         return 0;
 519 }
 520
 521 /*
 522  *      This routine will send an RST to the other tcp.
 523  *
 524  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 525  *                    for reset.
 526  *      Answer: if a packet caused RST, it is not for a socket
 527  *              existing in our system, if it is matched to a socket,
 528  *              it is just duplicate segment or bug in other side's TCP.
 529  *              So that we build reply only basing on parameters
 530  *              arrived with segment.
 531  *      Exception: precedence violation. We do not implement it in any case.
 532  */
 533
 534 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 535 {
 536         struct tcphdr *th = tcp_hdr(skb);
 537         struct {
 538                 struct tcphdr th;
 539 #ifdef CONFIG_TCP_MD5SIG
 540                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 541 #endif
 542         } rep;
 543         struct ip_reply_arg arg;
 544 #ifdef CONFIG_TCP_MD5SIG
 545         struct tcp_md5sig_key *key;
 546 #endif
 547
 548         /* Never send a reset in response to a reset. */
 549         if (th->rst)
 550                 return;
 551
 552         if (skb->rtable->rt_type != RTN_LOCAL)
 553                 return;
 554
 555         /* Swap the send and the receive. */
 556         memset(&rep, 0, sizeof(rep));
 557         rep.th.dest   = th->source;
 558         rep.th.source = th->dest;
 559         rep.th.doff   = sizeof(struct tcphdr) / 4;
 560         rep.th.rst    = 1;
 561
 562         if (th->ack) {
 563                 rep.th.seq = th->ack_seq;
 564         } else {
 565                 rep.th.ack = 1;
 566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 567                                        skb->len - (th->doff << 2));
 568         }
 569
 570         memset(&arg, 0, sizeof(arg));
 571         arg.iov[0].iov_base = (unsigned char *)&rep;
 572         arg.iov[0].iov_len  = sizeof(rep.th);
 573
 574 #ifdef CONFIG_TCP_MD5SIG
 575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 576         if (key) {
 577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 578                                    (TCPOPT_NOP << 16) |
 579                                    (TCPOPT_MD5SIG << 8) |
 580                                    TCPOLEN_MD5SIG);
 581                 /* Update length and the length the header thinks exists */
 582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 583                 rep.th.doff = arg.iov[0].iov_len / 4;
 584
 585                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
 586                                         key,
 587                                         ip_hdr(skb)->daddr,
 588                                         ip_hdr(skb)->saddr,
 589                                         &rep.th, IPPROTO_TCP,
 590                                         arg.iov[0].iov_len);
 591         }
 592 #endif
 593         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 594                                       ip_hdr(skb)->saddr, /* XXX */
 595                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
 596         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 597
 598         ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb,
 599                       &arg, arg.iov[0].iov_len);
 600
 601         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 602         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 603 }
 604
 605 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 606    outside socket context is ugly, certainly. What can I do?
 607  */
 608
 609 static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
 610                             struct sk_buff *skb, u32 seq, u32 ack,
 611                             u32 win, u32 ts)
 612 {
 613         struct tcphdr *th = tcp_hdr(skb);
 614         struct {
 615                 struct tcphdr th;
 616                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 617 #ifdef CONFIG_TCP_MD5SIG
 618                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 619 #endif
 620                         ];
 621         } rep;
 622         struct ip_reply_arg arg;
 623 #ifdef CONFIG_TCP_MD5SIG
 624         struct tcp_md5sig_key *key;
 625         struct tcp_md5sig_key tw_key;
 626 #endif
 627
 628         memset(&rep.th, 0, sizeof(struct tcphdr));
 629         memset(&arg, 0, sizeof(arg));
 630
 631         arg.iov[0].iov_base = (unsigned char *)&rep;
 632         arg.iov[0].iov_len  = sizeof(rep.th);
 633         if (ts) {
 634                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 635                                    (TCPOPT_TIMESTAMP << 8) |
 636                                    TCPOLEN_TIMESTAMP);
 637                 rep.opt[1] = htonl(tcp_time_stamp);
 638                 rep.opt[2] = htonl(ts);
 639                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 640         }
 641
 642         /* Swap the send and the receive. */
 643         rep.th.dest    = th->source;
 644         rep.th.source  = th->dest;
 645         rep.th.doff    = arg.iov[0].iov_len / 4;
 646         rep.th.seq     = htonl(seq);
 647         rep.th.ack_seq = htonl(ack);
 648         rep.th.ack     = 1;
 649         rep.th.window  = htons(win);
 650
 651 #ifdef CONFIG_TCP_MD5SIG
 652         /*
 653          * The SKB holds an imcoming packet, but may not have a valid ->sk
 654          * pointer. This is especially the case when we're dealing with a
 655          * TIME_WAIT ack, because the sk structure is long gone, and only
 656          * the tcp_timewait_sock remains. So the md5 key is stashed in that
 657          * structure, and we use it in preference.  I believe that (twsk ||
 658          * skb->sk) holds true, but we program defensively.
 659          */
 660         if (!twsk && skb->sk) {
 661                 key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
 662         } else if (twsk && twsk->tw_md5_keylen) {
 663                 tw_key.key = twsk->tw_md5_key;
 664                 tw_key.keylen = twsk->tw_md5_keylen;
 665                 key = &tw_key;
 666         } else
 667                 key = NULL;
 668
 669         if (key) {
 670                 int offset = (ts) ? 3 : 0;
 671
 672                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 673                                           (TCPOPT_NOP << 16) |
 674                                           (TCPOPT_MD5SIG << 8) |
 675                                           TCPOLEN_MD5SIG);
 676                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 677                 rep.th.doff = arg.iov[0].iov_len/4;
 678
 679                 tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
 680                                         key,
 681                                         ip_hdr(skb)->daddr,
 682                                         ip_hdr(skb)->saddr,
 683                                         &rep.th, IPPROTO_TCP,
 684                                         arg.iov[0].iov_len);
 685         }
 686 #endif
 687         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 688                                       ip_hdr(skb)->saddr, /* XXX */
 689                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 690         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 691         if (twsk)
 692                 arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;
 693
 694         ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb,
 695                       &arg, arg.iov[0].iov_len);
 696
 697         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 698 }
 699
 700 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 701 {
 702         struct inet_timewait_sock *tw = inet_twsk(sk);
 703         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 704
 705         tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 706                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 707                         tcptw->tw_ts_recent);
 708
 709         inet_twsk_put(tw);
 710 }
 711
 712 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
 713                                   struct request_sock *req)
 714 {
 715         tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
 716                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 717                         req->ts_recent);
 718 }
 719
 720 /*
 721  *      Send a SYN-ACK after having received a SYN.
 722  *      This still operates on a request_sock only, not on a big
 723  *      socket.
 724  */
 725 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 726                                 struct dst_entry *dst)
 727 {
 728         const struct inet_request_sock *ireq = inet_rsk(req);
 729         int err = -1;
 730         struct sk_buff * skb;
 731
 732         /* First, grab a route. */
 733         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 734                 return -1;
 735
 736         skb = tcp_make_synack(sk, dst, req);
 737
 738         if (skb) {
 739                 struct tcphdr *th = tcp_hdr(skb);
 740
 741                 th->check = tcp_v4_check(skb->len,
 742                                          ireq->loc_addr,
 743                                          ireq->rmt_addr,
 744                                          csum_partial((char *)th, skb->len,
 745                                                       skb->csum));
 746
 747                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 748                                             ireq->rmt_addr,
 749                                             ireq->opt);
 750                 err = net_xmit_eval(err);
 751         }
 752
 753         dst_release(dst);
 754         return err;
 755 }
 756
 757 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
 758 {
 759         return __tcp_v4_send_synack(sk, req, NULL);
 760 }
 761
 762 /*
 763  *      IPv4 request_sock destructor.
 764  */
 765 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 766 {
 767         kfree(inet_rsk(req)->opt);
 768 }
 769
 770 #ifdef CONFIG_SYN_COOKIES
 771 static void syn_flood_warning(struct sk_buff *skb)
 772 {
 773         static unsigned long warntime;
 774
 775         if (time_after(jiffies, (warntime + HZ * 60))) {
 776                 warntime = jiffies;
 777                 printk(KERN_INFO
 778                        "possible SYN flooding on port %d. Sending cookies.\n",
 779                        ntohs(tcp_hdr(skb)->dest));
 780         }
 781 }
 782 #endif
 783
 784 /*
 785  * Save and compile IPv4 options into the request_sock if needed.
 786  */
 787 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 788                                               struct sk_buff *skb)
 789 {
 790         struct ip_options *opt = &(IPCB(skb)->opt);
 791         struct ip_options *dopt = NULL;
 792
 793         if (opt && opt->optlen) {
 794                 int opt_size = optlength(opt);
 795                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 796                 if (dopt) {
 797                         if (ip_options_echo(dopt, skb)) {
 798                                 kfree(dopt);
 799                                 dopt = NULL;
 800                         }
 801                 }
 802         }
 803         return dopt;
 804 }
 805
 806 #ifdef CONFIG_TCP_MD5SIG
 807 /*
 808  * RFC2385 MD5 checksumming requires a mapping of
 809  * IP address->MD5 Key.
 810  * We need to maintain these in the sk structure.
 811  */
 812
 813 /* Find the Key structure for an address.  */
 814 static struct tcp_md5sig_key *
 815                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 816 {
 817         struct tcp_sock *tp = tcp_sk(sk);
 818         int i;
 819
 820         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 821                 return NULL;
 822         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 823                 if (tp->md5sig_info->keys4[i].addr == addr)
 824                         return &tp->md5sig_info->keys4[i].base;
 825         }
 826         return NULL;
 827 }
 828
 829 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 830                                          struct sock *addr_sk)
 831 {
 832         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 833 }
 834
 835 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 836
 837 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 838                                                       struct request_sock *req)
 839 {
 840         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 841 }
 842
 843 /* This can be called on a newly created socket, from other files */
 844 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 845                       u8 *newkey, u8 newkeylen)
 846 {
 847         /* Add Key to the list */
 848         struct tcp_md5sig_key *key;
 849         struct tcp_sock *tp = tcp_sk(sk);
 850         struct tcp4_md5sig_key *keys;
 851
 852         key = tcp_v4_md5_do_lookup(sk, addr);
 853         if (key) {
 854                 /* Pre-existing entry - just update that one. */
 855                 kfree(key->key);
 856                 key->key = newkey;
 857                 key->keylen = newkeylen;
 858         } else {
 859                 struct tcp_md5sig_info *md5sig;
 860
 861                 if (!tp->md5sig_info) {
 862                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 863                                                   GFP_ATOMIC);
 864                         if (!tp->md5sig_info) {
 865                                 kfree(newkey);
 866                                 return -ENOMEM;
 867                         }
 868                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 869                 }
 870                 if (tcp_alloc_md5sig_pool() == NULL) {
 871                         kfree(newkey);
 872                         return -ENOMEM;
 873                 }
 874                 md5sig = tp->md5sig_info;
 875
 876                 if (md5sig->alloced4 == md5sig->entries4) {
 877                         keys = kmalloc((sizeof(*keys) *
 878                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 879                         if (!keys) {
 880                                 kfree(newkey);
 881                                 tcp_free_md5sig_pool();
 882                                 return -ENOMEM;
 883                         }
 884
 885                         if (md5sig->entries4)
 886                                 memcpy(keys, md5sig->keys4,
 887                                        sizeof(*keys) * md5sig->entries4);
 888
 889                         /* Free old key list, and reference new one */
 890                         kfree(md5sig->keys4);
 891                         md5sig->keys4 = keys;
 892                         md5sig->alloced4++;
 893                 }
 894                 md5sig->entries4++;
 895                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 896                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 897                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 898         }
 899         return 0;
 900 }
 901
 902 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 903
 904 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 905                                u8 *newkey, u8 newkeylen)
 906 {
 907         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 908                                  newkey, newkeylen);
 909 }
 910
 911 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 912 {
 913         struct tcp_sock *tp = tcp_sk(sk);
 914         int i;
 915
 916         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 917                 if (tp->md5sig_info->keys4[i].addr == addr) {
 918                         /* Free the key */
 919                         kfree(tp->md5sig_info->keys4[i].base.key);
 920                         tp->md5sig_info->entries4--;
 921
 922                         if (tp->md5sig_info->entries4 == 0) {
 923                                 kfree(tp->md5sig_info->keys4);
 924                                 tp->md5sig_info->keys4 = NULL;
 925                                 tp->md5sig_info->alloced4 = 0;
 926                         } else if (tp->md5sig_info->entries4 != i) {
 927                                 /* Need to do some manipulation */
 928                                 memmove(&tp->md5sig_info->keys4[i],
 929                                         &tp->md5sig_info->keys4[i+1],
 930                                         (tp->md5sig_info->entries4 - i) *
 931                                          sizeof(struct tcp4_md5sig_key));
 932                         }
 933                         tcp_free_md5sig_pool();
 934                         return 0;
 935                 }
 936         }
 937         return -ENOENT;
 938 }
 939
 940 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 941
 942 static void tcp_v4_clear_md5_list(struct sock *sk)
 943 {
 944         struct tcp_sock *tp = tcp_sk(sk);
 945
 946         /* Free each key, then the set of key keys,
 947          * the crypto element, and then decrement our
 948          * hold on the last resort crypto.
 949          */
 950         if (tp->md5sig_info->entries4) {
 951                 int i;
 952                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 953                         kfree(tp->md5sig_info->keys4[i].base.key);
 954                 tp->md5sig_info->entries4 = 0;
 955                 tcp_free_md5sig_pool();
 956         }
 957         if (tp->md5sig_info->keys4) {
 958                 kfree(tp->md5sig_info->keys4);
 959                 tp->md5sig_info->keys4 = NULL;
 960                 tp->md5sig_info->alloced4  = 0;
 961         }
 962 }
 963
 964 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 965                                  int optlen)
 966 {
 967         struct tcp_md5sig cmd;
 968         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 969         u8 *newkey;
 970
 971         if (optlen < sizeof(cmd))
 972                 return -EINVAL;
 973
 974         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 975                 return -EFAULT;
 976
 977         if (sin->sin_family != AF_INET)
 978                 return -EINVAL;
 979
 980         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
 981                 if (!tcp_sk(sk)->md5sig_info)
 982                         return -ENOENT;
 983                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
 984         }
 985
 986         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 987                 return -EINVAL;
 988
 989         if (!tcp_sk(sk)->md5sig_info) {
 990                 struct tcp_sock *tp = tcp_sk(sk);
 991                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
 992
 993                 if (!p)
 994                         return -EINVAL;
 995
 996                 tp->md5sig_info = p;
 997                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 998         }
 999
1000         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1001         if (!newkey)
1002                 return -ENOMEM;
1003         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1004                                  newkey, cmd.tcpm_keylen);
1005 }
1006
1007 static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1008                                    __be32 saddr, __be32 daddr,
1009                                    struct tcphdr *th, int protocol,
1010                                    unsigned int tcplen)
1011 {
1012         struct scatterlist sg[4];
1013         __u16 data_len;
1014         int block = 0;
1015         __sum16 old_checksum;
1016         struct tcp_md5sig_pool *hp;
1017         struct tcp4_pseudohdr *bp;
1018         struct hash_desc *desc;
1019         int err;
1020         unsigned int nbytes = 0;
1021
1022         /*
1023          * Okay, so RFC2385 is turned on for this connection,
1024          * so we need to generate the MD5 hash for the packet now.
1025          */
1026
1027         hp = tcp_get_md5sig_pool();
1028         if (!hp)
1029                 goto clear_hash_noput;
1030
1031         bp = &hp->md5_blk.ip4;
1032         desc = &hp->md5_desc;
1033
1034         /*
1035          * 1. the TCP pseudo-header (in the order: source IP address,
1036          * destination IP address, zero-padded protocol number, and
1037          * segment length)
1038          */
1039         bp->saddr = saddr;
1040         bp->daddr = daddr;
1041         bp->pad = 0;
1042         bp->protocol = protocol;
1043         bp->len = htons(tcplen);
1044
1045         sg_init_table(sg, 4);
1046
1047         sg_set_buf(&sg[block++], bp, sizeof(*bp));
1048         nbytes += sizeof(*bp);
1049
1050         /* 2. the TCP header, excluding options, and assuming a
1051          * checksum of zero/
1052          */
1053         old_checksum = th->check;
1054         th->check = 0;
1055         sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1056         nbytes += sizeof(struct tcphdr);
1057
1058         /* 3. the TCP segment data (if any) */
1059         data_len = tcplen - (th->doff << 2);
1060         if (data_len > 0) {
1061                 unsigned char *data = (unsigned char *)th + (th->doff << 2);
1062                 sg_set_buf(&sg[block++], data, data_len);
1063                 nbytes += data_len;
1064         }
1065
1066         /* 4. an independently-specified key or password, known to both
1067          * TCPs and presumably connection-specific
1068          */
1069         sg_set_buf(&sg[block++], key->key, key->keylen);
1070         nbytes += key->keylen;
1071
1072         sg_mark_end(&sg[block - 1]);
1073
1074         /* Now store the Hash into the packet */
1075         err = crypto_hash_init(desc);
1076         if (err)
1077                 goto clear_hash;
1078         err = crypto_hash_update(desc, sg, nbytes);
1079         if (err)
1080                 goto clear_hash;
1081         err = crypto_hash_final(desc, md5_hash);
1082         if (err)
1083                 goto clear_hash;
1084
1085         /* Reset header, and free up the crypto */
1086         tcp_put_md5sig_pool();
1087         th->check = old_checksum;
1088
1089 out:
1090         return 0;
1091 clear_hash:
1092         tcp_put_md5sig_pool();
1093 clear_hash_noput:
1094         memset(md5_hash, 0, 16);
1095         goto out;
1096 }
1097
1098 int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1099                          struct sock *sk,
1100                          struct dst_entry *dst,
1101                          struct request_sock *req,
1102                          struct tcphdr *th, int protocol,
1103                          unsigned int tcplen)
1104 {
1105         __be32 saddr, daddr;
1106
1107         if (sk) {
1108                 saddr = inet_sk(sk)->saddr;
1109                 daddr = inet_sk(sk)->daddr;
1110         } else {
1111                 struct rtable *rt = (struct rtable *)dst;
1112                 BUG_ON(!rt);
1113                 saddr = rt->rt_src;
1114                 daddr = rt->rt_dst;
1115         }
1116         return tcp_v4_do_calc_md5_hash(md5_hash, key,
1117                                        saddr, daddr,
1118                                        th, protocol, tcplen);
1119 }
1120
1121 EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1122
1123 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1124 {
1125         /*
1126          * This gets called for each TCP segment that arrives
1127          * so we want to be efficient.
1128          * We have 3 drop cases:
1129          * o No MD5 hash and one expected.
1130          * o MD5 hash and we're not expecting one.
1131          * o MD5 hash and its wrong.
1132          */
1133         __u8 *hash_location = NULL;
1134         struct tcp_md5sig_key *hash_expected;
1135         const struct iphdr *iph = ip_hdr(skb);
1136         struct tcphdr *th = tcp_hdr(skb);
1137         int length = (th->doff << 2) - sizeof(struct tcphdr);
1138         int genhash;
1139         unsigned char *ptr;
1140         unsigned char newhash[16];
1141
1142         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1143
1144         /*
1145          * If the TCP option length is less than the TCP_MD5SIG
1146          * option length, then we can shortcut
1147          */
1148         if (length < TCPOLEN_MD5SIG) {
1149                 if (hash_expected)
1150                         return 1;
1151                 else
1152                         return 0;
1153         }
1154
1155         /* Okay, we can't shortcut - we have to grub through the options */
1156         ptr = (unsigned char *)(th + 1);
1157         while (length > 0) {
1158                 int opcode = *ptr++;
1159                 int opsize;
1160
1161                 switch (opcode) {
1162                 case TCPOPT_EOL:
1163                         goto done_opts;
1164                 case TCPOPT_NOP:
1165                         length--;
1166                         continue;
1167                 default:
1168                         opsize = *ptr++;
1169                         if (opsize < 2)
1170                                 goto done_opts;
1171                         if (opsize > length)
1172                                 goto done_opts;
1173
1174                         if (opcode == TCPOPT_MD5SIG) {
1175                                 hash_location = ptr;
1176                                 goto done_opts;
1177                         }
1178                 }
1179                 ptr += opsize-2;
1180                 length -= opsize;
1181         }
1182 done_opts:
1183         /* We've parsed the options - do we have a hash? */
1184         if (!hash_expected && !hash_location)
1185                 return 0;
1186
1187         if (hash_expected && !hash_location) {
1188                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1189                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1190                                NIPQUAD(iph->saddr), ntohs(th->source),
1191                                NIPQUAD(iph->daddr), ntohs(th->dest));
1192                 return 1;
1193         }
1194
1195         if (!hash_expected && hash_location) {
1196                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1197                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1198                                NIPQUAD(iph->saddr), ntohs(th->source),
1199                                NIPQUAD(iph->daddr), ntohs(th->dest));
1200                 return 1;
1201         }
1202
1203         /* Okay, so this is hash_expected and hash_location -
1204          * so we need to calculate the checksum.
1205          */
1206         genhash = tcp_v4_do_calc_md5_hash(newhash,
1207                                           hash_expected,
1208                                           iph->saddr, iph->daddr,
1209                                           th, sk->sk_protocol,
1210                                           skb->len);
1211
1212         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1213                 if (net_ratelimit()) {
1214                         printk(KERN_INFO "MD5 Hash failed for "
1215                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1216                                NIPQUAD(iph->saddr), ntohs(th->source),
1217                                NIPQUAD(iph->daddr), ntohs(th->dest),
1218                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1219                 }
1220                 return 1;
1221         }
1222         return 0;
1223 }
1224
1225 #endif
1226
1227 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1228         .family         =       PF_INET,
1229         .obj_size       =       sizeof(struct tcp_request_sock),
1230         .rtx_syn_ack    =       tcp_v4_send_synack,
1231         .send_ack       =       tcp_v4_reqsk_send_ack,
1232         .destructor     =       tcp_v4_reqsk_destructor,
1233         .send_reset     =       tcp_v4_send_reset,
1234 };
1235
1236 #ifdef CONFIG_TCP_MD5SIG
1237 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1238         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1239 };
1240 #endif
1241
1242 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1243         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1244         .twsk_unique    = tcp_twsk_unique,
1245         .twsk_destructor= tcp_twsk_destructor,
1246 };
1247
1248 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1249 {
1250         struct inet_request_sock *ireq;
1251         struct tcp_options_received tmp_opt;
1252         struct request_sock *req;
1253         __be32 saddr = ip_hdr(skb)->saddr;
1254         __be32 daddr = ip_hdr(skb)->daddr;
1255         __u32 isn = TCP_SKB_CB(skb)->when;
1256         struct dst_entry *dst = NULL;
1257 #ifdef CONFIG_SYN_COOKIES
1258         int want_cookie = 0;
1259 #else
1260 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1261 #endif
1262
1263         /* Never answer to SYNs send to broadcast or multicast */
1264         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1265                 goto drop;
1266
1267         /* TW buckets are converted to open requests without
1268          * limitations, they conserve resources and peer is
1269          * evidently real one.
1270          */
1271         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1272 #ifdef CONFIG_SYN_COOKIES
1273                 if (sysctl_tcp_syncookies) {
1274                         want_cookie = 1;
1275                 } else
1276 #endif
1277                 goto drop;
1278         }
1279
1280         /* Accept backlog is full. If we have already queued enough
1281          * of warm entries in syn queue, drop request. It is better than
1282          * clogging syn queue with openreqs with exponentially increasing
1283          * timeout.
1284          */
1285         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1286                 goto drop;
1287
1288         req = reqsk_alloc(&tcp_request_sock_ops);
1289         if (!req)
1290                 goto drop;
1291
1292 #ifdef CONFIG_TCP_MD5SIG
1293         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1294 #endif
1295
1296         tcp_clear_options(&tmp_opt);
1297         tmp_opt.mss_clamp = 536;
1298         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1299
1300         tcp_parse_options(skb, &tmp_opt, 0);
1301
1302         if (want_cookie) {
1303                 tcp_clear_options(&tmp_opt);
1304                 tmp_opt.saw_tstamp = 0;
1305         }
1306
1307         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1308                 /* Some OSes (unknown ones, but I see them on web server, which
1309                  * contains information interesting only for windows'
1310                  * users) do not send their stamp in SYN. It is easy case.
1311                  * We simply do not advertise TS support.
1312                  */
1313                 tmp_opt.saw_tstamp = 0;
1314                 tmp_opt.tstamp_ok  = 0;
1315         }
1316         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1317
1318         tcp_openreq_init(req, &tmp_opt, skb);
1319
1320         if (security_inet_conn_request(sk, skb, req))
1321                 goto drop_and_free;
1322
1323         ireq = inet_rsk(req);
1324         ireq->loc_addr = daddr;
1325         ireq->rmt_addr = saddr;
1326         ireq->opt = tcp_v4_save_options(sk, skb);
1327         if (!want_cookie)
1328                 TCP_ECN_create_request(req, tcp_hdr(skb));
1329
1330         if (want_cookie) {
1331 #ifdef CONFIG_SYN_COOKIES
1332                 syn_flood_warning(skb);
1333 #endif
1334                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1335         } else if (!isn) {
1336                 struct inet_peer *peer = NULL;
1337
1338                 /* VJ's idea. We save last timestamp seen
1339                  * from the destination in peer table, when entering
1340                  * state TIME-WAIT, and check against it before
1341                  * accepting new connection request.
1342                  *
1343                  * If "isn" is not zero, this request hit alive
1344                  * timewait bucket, so that all the necessary checks
1345                  * are made in the function processing timewait state.
1346                  */
1347                 if (tmp_opt.saw_tstamp &&
1348                     tcp_death_row.sysctl_tw_recycle &&
1349                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1350                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1351                     peer->v4daddr == saddr) {
1352                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1353                             (s32)(peer->tcp_ts - req->ts_recent) >
1354                                                         TCP_PAWS_WINDOW) {
1355                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1356                                 goto drop_and_release;
1357                         }
1358                 }
1359                 /* Kill the following clause, if you dislike this way. */
1360                 else if (!sysctl_tcp_syncookies &&
1361                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1362                           (sysctl_max_syn_backlog >> 2)) &&
1363                          (!peer || !peer->tcp_ts_stamp) &&
1364                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1365                         /* Without syncookies last quarter of
1366                          * backlog is filled with destinations,
1367                          * proven to be alive.
1368                          * It means that we continue to communicate
1369                          * to destinations, already remembered
1370                          * to the moment of synflood.
1371                          */
1372                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1373                                        "request from %u.%u.%u.%u/%u\n",
1374                                        NIPQUAD(saddr),
1375                                        ntohs(tcp_hdr(skb)->source));
1376                         goto drop_and_release;
1377                 }
1378
1379                 isn = tcp_v4_init_sequence(skb);
1380         }
1381         tcp_rsk(req)->snt_isn = isn;
1382
1383         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1384                 goto drop_and_free;
1385
1386         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387         return 0;
1388
1389 drop_and_release:
1390         dst_release(dst);
1391 drop_and_free:
1392         reqsk_free(req);
1393 drop:
1394         return 0;
1395 }
1396
1397
1398 /*
1399  * The three way handshake has completed - we got a valid synack -
1400  * now create the new socket.
1401  */
1402 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1403                                   struct request_sock *req,
1404                                   struct dst_entry *dst)
1405 {
1406         struct inet_request_sock *ireq;
1407         struct inet_sock *newinet;
1408         struct tcp_sock *newtp;
1409         struct sock *newsk;
1410 #ifdef CONFIG_TCP_MD5SIG
1411         struct tcp_md5sig_key *key;
1412 #endif
1413
1414         if (sk_acceptq_is_full(sk))
1415                 goto exit_overflow;
1416
1417         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1418                 goto exit;
1419
1420         newsk = tcp_create_openreq_child(sk, req, skb);
1421         if (!newsk)
1422                 goto exit;
1423
1424         newsk->sk_gso_type = SKB_GSO_TCPV4;
1425         sk_setup_caps(newsk, dst);
1426
1427         newtp                 = tcp_sk(newsk);
1428         newinet               = inet_sk(newsk);
1429         ireq                  = inet_rsk(req);
1430         newinet->daddr        = ireq->rmt_addr;
1431         newinet->rcv_saddr    = ireq->loc_addr;
1432         newinet->saddr        = ireq->loc_addr;
1433         newinet->opt          = ireq->opt;
1434         ireq->opt             = NULL;
1435         newinet->mc_index     = inet_iif(skb);
1436         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1437         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1438         if (newinet->opt)
1439                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1440         newinet->id = newtp->write_seq ^ jiffies;
1441
1442         tcp_mtup_init(newsk);
1443         tcp_sync_mss(newsk, dst_mtu(dst));
1444         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1445         tcp_initialize_rcv_mss(newsk);
1446
1447 #ifdef CONFIG_TCP_MD5SIG
1448         /* Copy over the MD5 key from the original socket */
1449         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1450                 /*
1451                  * We're using one, so create a matching key
1452                  * on the newsk structure. If we fail to get
1453                  * memory, then we end up not copying the key
1454                  * across. Shucks.
1455                  */
1456                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1457                 if (newkey != NULL)
1458                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1459                                           newkey, key->keylen);
1460         }
1461 #endif
1462
1463         __inet_hash_nolisten(newsk);
1464         __inet_inherit_port(sk, newsk);
1465
1466         return newsk;
1467
1468 exit_overflow:
1469         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1470 exit:
1471         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1472         dst_release(dst);
1473         return NULL;
1474 }
1475
1476 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1477 {
1478         struct tcphdr *th = tcp_hdr(skb);
1479         const struct iphdr *iph = ip_hdr(skb);
1480         struct sock *nsk;
1481         struct request_sock **prev;
1482         /* Find possible connection requests. */
1483         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1484                                                        iph->saddr, iph->daddr);
1485         if (req)
1486                 return tcp_check_req(sk, skb, req, prev);
1487
1488         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1489                         th->source, iph->daddr, th->dest, inet_iif(skb));
1490
1491         if (nsk) {
1492                 if (nsk->sk_state != TCP_TIME_WAIT) {
1493                         bh_lock_sock(nsk);
1494                         return nsk;
1495                 }
1496                 inet_twsk_put(inet_twsk(nsk));
1497                 return NULL;
1498         }
1499
1500 #ifdef CONFIG_SYN_COOKIES
1501         if (!th->rst && !th->syn && th->ack)
1502                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1503 #endif
1504         return sk;
1505 }
1506
1507 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1508 {
1509         const struct iphdr *iph = ip_hdr(skb);
1510
1511         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1512                 if (!tcp_v4_check(skb->len, iph->saddr,
1513                                   iph->daddr, skb->csum)) {
1514                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1515                         return 0;
1516                 }
1517         }
1518
1519         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1520                                        skb->len, IPPROTO_TCP, 0);
1521
1522         if (skb->len <= 76) {
1523                 return __skb_checksum_complete(skb);
1524         }
1525         return 0;
1526 }
1527
1528
1529 /* The socket must have it's spinlock held when we get
1530  * here.
1531  *
1532  * We have a potential double-lock case here, so even when
1533  * doing backlog processing we use the BH locking scheme.
1534  * This is because we cannot sleep with the original spinlock
1535  * held.
1536  */
1537 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1538 {
1539         struct sock *rsk;
1540 #ifdef CONFIG_TCP_MD5SIG
1541         /*
1542          * We really want to reject the packet as early as possible
1543          * if:
1544          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1545          *  o There is an MD5 option and we're not expecting one
1546          */
1547         if (tcp_v4_inbound_md5_hash(sk, skb))
1548                 goto discard;
1549 #endif
1550
1551         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1552                 TCP_CHECK_TIMER(sk);
1553                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1554                         rsk = sk;
1555                         goto reset;
1556                 }
1557                 TCP_CHECK_TIMER(sk);
1558                 return 0;
1559         }
1560
1561         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1562                 goto csum_err;
1563
1564         if (sk->sk_state == TCP_LISTEN) {
1565                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1566                 if (!nsk)
1567                         goto discard;
1568
1569                 if (nsk != sk) {
1570                         if (tcp_child_process(sk, nsk, skb)) {
1571                                 rsk = nsk;
1572                                 goto reset;
1573                         }
1574                         return 0;
1575                 }
1576         }
1577
1578         TCP_CHECK_TIMER(sk);
1579         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1580                 rsk = sk;
1581                 goto reset;
1582         }
1583         TCP_CHECK_TIMER(sk);
1584         return 0;
1585
1586 reset:
1587         tcp_v4_send_reset(rsk, skb);
1588 discard:
1589         kfree_skb(skb);
1590         /* Be careful here. If this function gets more complicated and
1591          * gcc suffers from register pressure on the x86, sk (in %ebx)
1592          * might be destroyed here. This current version compiles correctly,
1593          * but you have been warned.
1594          */
1595         return 0;
1596
1597 csum_err:
1598         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1599         goto discard;
1600 }
1601
1602 /*
1603  *      From tcp_input.c
1604  */
1605
1606 int tcp_v4_rcv(struct sk_buff *skb)
1607 {
1608         const struct iphdr *iph;
1609         struct tcphdr *th;
1610         struct sock *sk;
1611         int ret;
1612
1613         if (skb->pkt_type != PACKET_HOST)
1614                 goto discard_it;
1615
1616         /* Count it even if it's bad */
1617         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1618
1619         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1620                 goto discard_it;
1621
1622         th = tcp_hdr(skb);
1623
1624         if (th->doff < sizeof(struct tcphdr) / 4)
1625                 goto bad_packet;
1626         if (!pskb_may_pull(skb, th->doff * 4))
1627                 goto discard_it;
1628
1629         /* An explanation is required here, I think.
1630          * Packet length and doff are validated by header prediction,
1631          * provided case of th->doff==0 is eliminated.
1632          * So, we defer the checks. */
1633         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1634                 goto bad_packet;
1635
1636         th = tcp_hdr(skb);
1637         iph = ip_hdr(skb);
1638         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1639         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1640                                     skb->len - th->doff * 4);
1641         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1642         TCP_SKB_CB(skb)->when    = 0;
1643         TCP_SKB_CB(skb)->flags   = iph->tos;
1644         TCP_SKB_CB(skb)->sacked  = 0;
1645
1646         sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
1647                         th->source, iph->daddr, th->dest, inet_iif(skb));
1648         if (!sk)
1649                 goto no_tcp_socket;
1650
1651 process:
1652         if (sk->sk_state == TCP_TIME_WAIT)
1653                 goto do_time_wait;
1654
1655         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1656                 goto discard_and_relse;
1657         nf_reset(skb);
1658
1659         if (sk_filter(sk, skb))
1660                 goto discard_and_relse;
1661
1662         skb->dev = NULL;
1663
1664         bh_lock_sock_nested(sk);
1665         ret = 0;
1666         if (!sock_owned_by_user(sk)) {
1667 #ifdef CONFIG_NET_DMA
1668                 struct tcp_sock *tp = tcp_sk(sk);
1669                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1670                         tp->ucopy.dma_chan = get_softnet_dma();
1671                 if (tp->ucopy.dma_chan)
1672                         ret = tcp_v4_do_rcv(sk, skb);
1673                 else
1674 #endif
1675                 {
1676                         if (!tcp_prequeue(sk, skb))
1677                         ret = tcp_v4_do_rcv(sk, skb);
1678                 }
1679         } else
1680                 sk_add_backlog(sk, skb);
1681         bh_unlock_sock(sk);
1682
1683         sock_put(sk);
1684
1685         return ret;
1686
1687 no_tcp_socket:
1688         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1689                 goto discard_it;
1690
1691         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1692 bad_packet:
1693                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1694         } else {
1695                 tcp_v4_send_reset(NULL, skb);
1696         }
1697
1698 discard_it:
1699         /* Discard frame. */
1700         kfree_skb(skb);
1701         return 0;
1702
1703 discard_and_relse:
1704         sock_put(sk);
1705         goto discard_it;
1706
1707 do_time_wait:
1708         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1709                 inet_twsk_put(inet_twsk(sk));
1710                 goto discard_it;
1711         }
1712
1713         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1714                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1715                 inet_twsk_put(inet_twsk(sk));
1716                 goto discard_it;
1717         }
1718         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1719         case TCP_TW_SYN: {
1720                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1721                                                         &tcp_hashinfo,
1722                                                         iph->daddr, th->dest,
1723                                                         inet_iif(skb));
1724                 if (sk2) {
1725                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1726                         inet_twsk_put(inet_twsk(sk));
1727                         sk = sk2;
1728                         goto process;
1729                 }
1730                 /* Fall through to ACK */
1731         }
1732         case TCP_TW_ACK:
1733                 tcp_v4_timewait_ack(sk, skb);
1734                 break;
1735         case TCP_TW_RST:
1736                 goto no_tcp_socket;
1737         case TCP_TW_SUCCESS:;
1738         }
1739         goto discard_it;
1740 }
1741
1742 /* VJ's idea. Save last timestamp seen from this destination
1743  * and hold it at least for normal timewait interval to use for duplicate
1744  * segment detection in subsequent connections, before they enter synchronized
1745  * state.
1746  */
1747
1748 int tcp_v4_remember_stamp(struct sock *sk)
1749 {
1750         struct inet_sock *inet = inet_sk(sk);
1751         struct tcp_sock *tp = tcp_sk(sk);
1752         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1753         struct inet_peer *peer = NULL;
1754         int release_it = 0;
1755
1756         if (!rt || rt->rt_dst != inet->daddr) {
1757                 peer = inet_getpeer(inet->daddr, 1);
1758                 release_it = 1;
1759         } else {
1760                 if (!rt->peer)
1761                         rt_bind_peer(rt, 1);
1762                 peer = rt->peer;
1763         }
1764
1765         if (peer) {
1766                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1767                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1768                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1769                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1770                         peer->tcp_ts = tp->rx_opt.ts_recent;
1771                 }
1772                 if (release_it)
1773                         inet_putpeer(peer);
1774                 return 1;
1775         }
1776
1777         return 0;
1778 }
1779
1780 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1781 {
1782         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1783
1784         if (peer) {
1785                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1786
1787                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1788                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1789                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1790                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1791                         peer->tcp_ts       = tcptw->tw_ts_recent;
1792                 }
1793                 inet_putpeer(peer);
1794                 return 1;
1795         }
1796
1797         return 0;
1798 }
1799
1800 struct inet_connection_sock_af_ops ipv4_specific = {
1801         .queue_xmit        = ip_queue_xmit,
1802         .send_check        = tcp_v4_send_check,
1803         .rebuild_header    = inet_sk_rebuild_header,
1804         .conn_request      = tcp_v4_conn_request,
1805         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1806         .remember_stamp    = tcp_v4_remember_stamp,
1807         .net_header_len    = sizeof(struct iphdr),
1808         .setsockopt        = ip_setsockopt,
1809         .getsockopt        = ip_getsockopt,
1810         .addr2sockaddr     = inet_csk_addr2sockaddr,
1811         .sockaddr_len      = sizeof(struct sockaddr_in),
1812         .bind_conflict     = inet_csk_bind_conflict,
1813 #ifdef CONFIG_COMPAT
1814         .compat_setsockopt = compat_ip_setsockopt,
1815         .compat_getsockopt = compat_ip_getsockopt,
1816 #endif
1817 };
1818
1819 #ifdef CONFIG_TCP_MD5SIG
1820 static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1821         .md5_lookup             = tcp_v4_md5_lookup,
1822         .calc_md5_hash          = tcp_v4_calc_md5_hash,
1823         .md5_add                = tcp_v4_md5_add_func,
1824         .md5_parse              = tcp_v4_parse_md5_keys,
1825 };
1826 #endif
1827
1828 /* NOTE: A lot of things set to zero explicitly by call to
1829  *       sk_alloc() so need not be done here.
1830  */
1831 static int tcp_v4_init_sock(struct sock *sk)
1832 {
1833         struct inet_connection_sock *icsk = inet_csk(sk);
1834         struct tcp_sock *tp = tcp_sk(sk);
1835
1836         skb_queue_head_init(&tp->out_of_order_queue);
1837         tcp_init_xmit_timers(sk);
1838         tcp_prequeue_init(tp);
1839
1840         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1841         tp->mdev = TCP_TIMEOUT_INIT;
1842
1843         /* So many TCP implementations out there (incorrectly) count the
1844          * initial SYN frame in their delayed-ACK and congestion control
1845          * algorithms that we must have the following bandaid to talk
1846          * efficiently to them.  -DaveM
1847          */
1848         tp->snd_cwnd = 2;
1849
1850         /* See draft-stevens-tcpca-spec-01 for discussion of the
1851          * initialization of these values.
1852          */
1853         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1854         tp->snd_cwnd_clamp = ~0;
1855         tp->mss_cache = 536;
1856
1857         tp->reordering = sysctl_tcp_reordering;
1858         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1859
1860         sk->sk_state = TCP_CLOSE;
1861
1862         sk->sk_write_space = sk_stream_write_space;
1863         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1864
1865         icsk->icsk_af_ops = &ipv4_specific;
1866         icsk->icsk_sync_mss = tcp_sync_mss;
1867 #ifdef CONFIG_TCP_MD5SIG
1868         tp->af_specific = &tcp_sock_ipv4_specific;
1869 #endif
1870
1871         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1872         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1873
1874         atomic_inc(&tcp_sockets_allocated);
1875
1876         return 0;
1877 }
1878
1879 int tcp_v4_destroy_sock(struct sock *sk)
1880 {
1881         struct tcp_sock *tp = tcp_sk(sk);
1882
1883         tcp_clear_xmit_timers(sk);
1884
1885         tcp_cleanup_congestion_control(sk);
1886
1887         /* Cleanup up the write buffer. */
1888         tcp_write_queue_purge(sk);
1889
1890         /* Cleans up our, hopefully empty, out_of_order_queue. */
1891         __skb_queue_purge(&tp->out_of_order_queue);
1892
1893 #ifdef CONFIG_TCP_MD5SIG
1894         /* Clean up the MD5 key list, if any */
1895         if (tp->md5sig_info) {
1896                 tcp_v4_clear_md5_list(sk);
1897                 kfree(tp->md5sig_info);
1898                 tp->md5sig_info = NULL;
1899         }
1900 #endif
1901
1902 #ifdef CONFIG_NET_DMA
1903         /* Cleans up our sk_async_wait_queue */
1904         __skb_queue_purge(&sk->sk_async_wait_queue);
1905 #endif
1906
1907         /* Clean prequeue, it must be empty really */
1908         __skb_queue_purge(&tp->ucopy.prequeue);
1909
1910         /* Clean up a referenced TCP bind bucket. */
1911         if (inet_csk(sk)->icsk_bind_hash)
1912                 inet_put_port(sk);
1913
1914         /*
1915          * If sendmsg cached page exists, toss it.
1916          */
1917         if (sk->sk_sndmsg_page) {
1918                 __free_page(sk->sk_sndmsg_page);
1919                 sk->sk_sndmsg_page = NULL;
1920         }
1921
1922         if (tp->defer_tcp_accept.request) {
1923                 reqsk_free(tp->defer_tcp_accept.request);
1924                 sock_put(tp->defer_tcp_accept.listen_sk);
1925                 sock_put(sk);
1926                 tp->defer_tcp_accept.listen_sk = NULL;
1927                 tp->defer_tcp_accept.request = NULL;
1928         }
1929
1930         atomic_dec(&tcp_sockets_allocated);
1931
1932         return 0;
1933 }
1934
1935 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1936
1937 #ifdef CONFIG_PROC_FS
1938 /* Proc filesystem TCP sock list dumping. */
1939
1940 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1941 {
1942         return hlist_empty(head) ? NULL :
1943                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1944 }
1945
1946 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1947 {
1948         return tw->tw_node.next ?
1949                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1950 }
1951
1952 static void *listening_get_next(struct seq_file *seq, void *cur)
1953 {
1954         struct inet_connection_sock *icsk;
1955         struct hlist_node *node;
1956         struct sock *sk = cur;
1957         struct tcp_iter_state* st = seq->private;
1958         struct net *net = st->net;
1959
1960         if (!sk) {
1961                 st->bucket = 0;
1962                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1963                 goto get_sk;
1964         }
1965
1966         ++st->num;
1967
1968         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1969                 struct request_sock *req = cur;
1970
1971                 icsk = inet_csk(st->syn_wait_sk);
1972                 req = req->dl_next;
1973                 while (1) {
1974                         while (req) {
1975                                 if (req->rsk_ops->family == st->family &&
1976                                     net_eq(sock_net(req->sk), net)) {
1977                                         cur = req;
1978                                         goto out;
1979                                 }
1980                                 req = req->dl_next;
1981                         }
1982                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1983                                 break;
1984 get_req:
1985                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1986                 }
1987                 sk        = sk_next(st->syn_wait_sk);
1988                 st->state = TCP_SEQ_STATE_LISTENING;
1989                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1990         } else {
1991                 icsk = inet_csk(sk);
1992                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1993                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1994                         goto start_req;
1995                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1996                 sk = sk_next(sk);
1997         }
1998 get_sk:
1999         sk_for_each_from(sk, node) {
2000                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2001                         cur = sk;
2002                         goto out;
2003                 }
2004                 icsk = inet_csk(sk);
2005                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2006                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2007 start_req:
2008                         st->uid         = sock_i_uid(sk);
2009                         st->syn_wait_sk = sk;
2010                         st->state       = TCP_SEQ_STATE_OPENREQ;
2011                         st->sbucket     = 0;
2012                         goto get_req;
2013                 }
2014                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2015         }
2016         if (++st->bucket < INET_LHTABLE_SIZE) {
2017                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
2018                 goto get_sk;
2019         }
2020         cur = NULL;
2021 out:
2022         return cur;
2023 }
2024
2025 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2026 {
2027         void *rc = listening_get_next(seq, NULL);
2028
2029         while (rc && *pos) {
2030                 rc = listening_get_next(seq, rc);
2031                 --*pos;
2032         }
2033         return rc;
2034 }
2035
2036 static void *established_get_first(struct seq_file *seq)
2037 {
2038         struct tcp_iter_state* st = seq->private;
2039         struct net *net = st->net;
2040         void *rc = NULL;
2041
2042         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2043                 struct sock *sk;
2044                 struct hlist_node *node;
2045                 struct inet_timewait_sock *tw;
2046                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2047
2048                 read_lock_bh(lock);
2049                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2050                         if (sk->sk_family != st->family ||
2051                             !net_eq(sock_net(sk), net)) {
2052                                 continue;
2053                         }
2054                         rc = sk;
2055                         goto out;
2056                 }
2057                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2058                 inet_twsk_for_each(tw, node,
2059                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2060                         if (tw->tw_family != st->family ||
2061                             !net_eq(twsk_net(tw), net)) {
2062                                 continue;
2063                         }
2064                         rc = tw;
2065                         goto out;
2066                 }
2067                 read_unlock_bh(lock);
2068                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2069         }
2070 out:
2071         return rc;
2072 }
2073
2074 static void *established_get_next(struct seq_file *seq, void *cur)
2075 {
2076         struct sock *sk = cur;
2077         struct inet_timewait_sock *tw;
2078         struct hlist_node *node;
2079         struct tcp_iter_state* st = seq->private;
2080         struct net *net = st->net;
2081
2082         ++st->num;
2083
2084         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2085                 tw = cur;
2086                 tw = tw_next(tw);
2087 get_tw:
2088                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2089                         tw = tw_next(tw);
2090                 }
2091                 if (tw) {
2092                         cur = tw;
2093                         goto out;
2094                 }
2095                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2096                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2097
2098                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2099                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2100                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2101                 } else {
2102                         cur = NULL;
2103                         goto out;
2104                 }
2105         } else
2106                 sk = sk_next(sk);
2107
2108         sk_for_each_from(sk, node) {
2109                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2110                         goto found;
2111         }
2112
2113         st->state = TCP_SEQ_STATE_TIME_WAIT;
2114         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2115         goto get_tw;
2116 found:
2117         cur = sk;
2118 out:
2119         return cur;
2120 }
2121
2122 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2123 {
2124         void *rc = established_get_first(seq);
2125
2126         while (rc && pos) {
2127                 rc = established_get_next(seq, rc);
2128                 --pos;
2129         }
2130         return rc;
2131 }
2132
2133 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2134 {
2135         void *rc;
2136         struct tcp_iter_state* st = seq->private;
2137
2138         inet_listen_lock(&tcp_hashinfo);
2139         st->state = TCP_SEQ_STATE_LISTENING;
2140         rc        = listening_get_idx(seq, &pos);
2141
2142         if (!rc) {
2143                 inet_listen_unlock(&tcp_hashinfo);
2144                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2145                 rc        = established_get_idx(seq, pos);
2146         }
2147
2148         return rc;
2149 }
2150
2151 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2152 {
2153         struct tcp_iter_state* st = seq->private;
2154         st->state = TCP_SEQ_STATE_LISTENING;
2155         st->num = 0;
2156         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2157 }
2158
2159 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2160 {
2161         void *rc = NULL;
2162         struct tcp_iter_state* st;
2163
2164         if (v == SEQ_START_TOKEN) {
2165                 rc = tcp_get_idx(seq, 0);
2166                 goto out;
2167         }
2168         st = seq->private;
2169
2170         switch (st->state) {
2171         case TCP_SEQ_STATE_OPENREQ:
2172         case TCP_SEQ_STATE_LISTENING:
2173                 rc = listening_get_next(seq, v);
2174                 if (!rc) {
2175                         inet_listen_unlock(&tcp_hashinfo);
2176                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2177                         rc        = established_get_first(seq);
2178                 }
2179                 break;
2180         case TCP_SEQ_STATE_ESTABLISHED:
2181         case TCP_SEQ_STATE_TIME_WAIT:
2182                 rc = established_get_next(seq, v);
2183                 break;
2184         }
2185 out:
2186         ++*pos;
2187         return rc;
2188 }
2189
2190 static void tcp_seq_stop(struct seq_file *seq, void *v)
2191 {
2192         struct tcp_iter_state* st = seq->private;
2193
2194         switch (st->state) {
2195         case TCP_SEQ_STATE_OPENREQ:
2196                 if (v) {
2197                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2198                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2199                 }
2200         case TCP_SEQ_STATE_LISTENING:
2201                 if (v != SEQ_START_TOKEN)
2202                         inet_listen_unlock(&tcp_hashinfo);
2203                 break;
2204         case TCP_SEQ_STATE_TIME_WAIT:
2205         case TCP_SEQ_STATE_ESTABLISHED:
2206                 if (v)
2207                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2208                 break;
2209         }
2210 }
2211
2212 static int tcp_seq_open(struct inode *inode, struct file *file)
2213 {
2214         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2215         struct seq_file *seq;
2216         struct tcp_iter_state *s;
2217         struct net *net;
2218         int rc;
2219
2220         if (unlikely(afinfo == NULL))
2221                 return -EINVAL;
2222
2223         s = kzalloc(sizeof(*s), GFP_KERNEL);
2224         if (!s)
2225                 return -ENOMEM;
2226
2227         rc = -ENXIO;
2228         net = get_proc_net(inode);
2229         if (!net)
2230                 goto out_kfree;
2231
2232         s->family               = afinfo->family;
2233         s->seq_ops.start        = tcp_seq_start;
2234         s->seq_ops.next         = tcp_seq_next;
2235         s->seq_ops.show         = afinfo->seq_show;
2236         s->seq_ops.stop         = tcp_seq_stop;
2237         s->net                  = net;
2238
2239         rc = seq_open(file, &s->seq_ops);
2240         if (rc)
2241                 goto out_put_net;
2242         seq = file->private_data;
2243         seq->private = s;
2244 out:
2245         return rc;
2246 out_put_net:
2247         put_net(net);
2248 out_kfree:
2249         kfree(s);
2250         goto out;
2251 }
2252
2253 static int tcp_seq_release(struct inode *inode, struct file *file)
2254 {
2255         struct seq_file *seq = file->private_data;
2256         struct tcp_iter_state *s = seq->private;
2257
2258         put_net(s->net);
2259         seq_release_private(inode, file);
2260         return 0;
2261 }
2262
2263 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2264 {
2265         int rc = 0;
2266         struct proc_dir_entry *p;
2267
2268         if (!afinfo)
2269                 return -EINVAL;
2270         afinfo->seq_fops->owner         = afinfo->owner;
2271         afinfo->seq_fops->open          = tcp_seq_open;
2272         afinfo->seq_fops->read          = seq_read;
2273         afinfo->seq_fops->llseek        = seq_lseek;
2274         afinfo->seq_fops->release       = tcp_seq_release;
2275
2276         p = proc_net_fops_create(net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2277         if (p)
2278                 p->data = afinfo;
2279         else
2280                 rc = -ENOMEM;
2281         return rc;
2282 }
2283
2284 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2285 {
2286         if (!afinfo)
2287                 return;
2288         proc_net_remove(net, afinfo->name);
2289         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2290 }
2291
2292 static void get_openreq4(struct sock *sk, struct request_sock *req,
2293                          char *tmpbuf, int i, int uid)
2294 {
2295         const struct inet_request_sock *ireq = inet_rsk(req);
2296         int ttd = req->expires - jiffies;
2297
2298         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2299                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2300                 i,
2301                 ireq->loc_addr,
2302                 ntohs(inet_sk(sk)->sport),
2303                 ireq->rmt_addr,
2304                 ntohs(ireq->rmt_port),
2305                 TCP_SYN_RECV,
2306                 0, 0, /* could print option size, but that is af dependent. */
2307                 1,    /* timers active (only the expire timer) */
2308                 jiffies_to_clock_t(ttd),
2309                 req->retrans,
2310                 uid,
2311                 0,  /* non standard timer */
2312                 0, /* open_requests have no inode */
2313                 atomic_read(&sk->sk_refcnt),
2314                 req);
2315 }
2316
2317 static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2318 {
2319         int timer_active;
2320         unsigned long timer_expires;
2321         struct tcp_sock *tp = tcp_sk(sk);
2322         const struct inet_connection_sock *icsk = inet_csk(sk);
2323         struct inet_sock *inet = inet_sk(sk);
2324         __be32 dest = inet->daddr;
2325         __be32 src = inet->rcv_saddr;
2326         __u16 destp = ntohs(inet->dport);
2327         __u16 srcp = ntohs(inet->sport);
2328
2329         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2330                 timer_active    = 1;
2331                 timer_expires   = icsk->icsk_timeout;
2332         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2333                 timer_active    = 4;
2334                 timer_expires   = icsk->icsk_timeout;
2335         } else if (timer_pending(&sk->sk_timer)) {
2336                 timer_active    = 2;
2337                 timer_expires   = sk->sk_timer.expires;
2338         } else {
2339                 timer_active    = 0;
2340                 timer_expires = jiffies;
2341         }
2342
2343         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2344                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2345                 i, src, srcp, dest, destp, sk->sk_state,
2346                 tp->write_seq - tp->snd_una,
2347                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2348                                              (tp->rcv_nxt - tp->copied_seq),
2349                 timer_active,
2350                 jiffies_to_clock_t(timer_expires - jiffies),
2351                 icsk->icsk_retransmits,
2352                 sock_i_uid(sk),
2353                 icsk->icsk_probes_out,
2354                 sock_i_ino(sk),
2355                 atomic_read(&sk->sk_refcnt), sk,
2356                 icsk->icsk_rto,
2357                 icsk->icsk_ack.ato,
2358                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2359                 tp->snd_cwnd,
2360                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2361 }
2362
2363 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2364                                char *tmpbuf, int i)
2365 {
2366         __be32 dest, src;
2367         __u16 destp, srcp;
2368         int ttd = tw->tw_ttd - jiffies;
2369
2370         if (ttd < 0)
2371                 ttd = 0;
2372
2373         dest  = tw->tw_daddr;
2374         src   = tw->tw_rcv_saddr;
2375         destp = ntohs(tw->tw_dport);
2376         srcp  = ntohs(tw->tw_sport);
2377
2378         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2379                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2380                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2381                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2382                 atomic_read(&tw->tw_refcnt), tw);
2383 }
2384
2385 #define TMPSZ 150
2386
2387 static int tcp4_seq_show(struct seq_file *seq, void *v)
2388 {
2389         struct tcp_iter_state* st;
2390         char tmpbuf[TMPSZ + 1];
2391
2392         if (v == SEQ_START_TOKEN) {
2393                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2394                            "  sl  local_address rem_address   st tx_queue "
2395                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2396                            "inode");
2397                 goto out;
2398         }
2399         st = seq->private;
2400
2401         switch (st->state) {
2402         case TCP_SEQ_STATE_LISTENING:
2403         case TCP_SEQ_STATE_ESTABLISHED:
2404                 get_tcp4_sock(v, tmpbuf, st->num);
2405                 break;
2406         case TCP_SEQ_STATE_OPENREQ:
2407                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2408                 break;
2409         case TCP_SEQ_STATE_TIME_WAIT:
2410                 get_timewait4_sock(v, tmpbuf, st->num);
2411                 break;
2412         }
2413         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2414 out:
2415         return 0;
2416 }
2417
2418 static struct file_operations tcp4_seq_fops;
2419 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2420         .owner          = THIS_MODULE,
2421         .name           = "tcp",
2422         .family         = AF_INET,
2423         .seq_show       = tcp4_seq_show,
2424         .seq_fops       = &tcp4_seq_fops,
2425 };
2426
2427 static int tcp4_proc_init_net(struct net *net)
2428 {
2429         return tcp_proc_register(net, &tcp4_seq_afinfo);
2430 }
2431
2432 static void tcp4_proc_exit_net(struct net *net)
2433 {
2434         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2435 }
2436
2437 static struct pernet_operations tcp4_net_ops = {
2438         .init = tcp4_proc_init_net,
2439         .exit = tcp4_proc_exit_net,
2440 };
2441
2442 int __init tcp4_proc_init(void)
2443 {
2444         return register_pernet_subsys(&tcp4_net_ops);
2445 }
2446
2447 void tcp4_proc_exit(void)
2448 {
2449         unregister_pernet_subsys(&tcp4_net_ops);
2450 }
2451 #endif /* CONFIG_PROC_FS */
2452
2453 struct proto tcp_prot = {
2454         .name                   = "TCP",
2455         .owner                  = THIS_MODULE,
2456         .close                  = tcp_close,
2457         .connect                = tcp_v4_connect,
2458         .disconnect             = tcp_disconnect,
2459         .accept                 = inet_csk_accept,
2460         .ioctl                  = tcp_ioctl,
2461         .init                   = tcp_v4_init_sock,
2462         .destroy                = tcp_v4_destroy_sock,
2463         .shutdown               = tcp_shutdown,
2464         .setsockopt             = tcp_setsockopt,
2465         .getsockopt             = tcp_getsockopt,
2466         .recvmsg                = tcp_recvmsg,
2467         .backlog_rcv            = tcp_v4_do_rcv,
2468         .hash                   = inet_hash,
2469         .unhash                 = inet_unhash,
2470         .get_port               = inet_csk_get_port,
2471         .enter_memory_pressure  = tcp_enter_memory_pressure,
2472         .sockets_allocated      = &tcp_sockets_allocated,
2473         .orphan_count           = &tcp_orphan_count,
2474         .memory_allocated       = &tcp_memory_allocated,
2475         .memory_pressure        = &tcp_memory_pressure,
2476         .sysctl_mem             = sysctl_tcp_mem,
2477         .sysctl_wmem            = sysctl_tcp_wmem,
2478         .sysctl_rmem            = sysctl_tcp_rmem,
2479         .max_header             = MAX_TCP_HEADER,
2480         .obj_size               = sizeof(struct tcp_sock),
2481         .twsk_prot              = &tcp_timewait_sock_ops,
2482         .rsk_prot               = &tcp_request_sock_ops,
2483         .h.hashinfo             = &tcp_hashinfo,
2484 #ifdef CONFIG_COMPAT
2485         .compat_setsockopt      = compat_tcp_setsockopt,
2486         .compat_getsockopt      = compat_tcp_getsockopt,
2487 #endif
2488 };
2489
2490
2491 static int __net_init tcp_sk_init(struct net *net)
2492 {
2493         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2494                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2495 }
2496
2497 static void __net_exit tcp_sk_exit(struct net *net)
2498 {
2499         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2500 }
2501
2502 static struct pernet_operations __net_initdata tcp_sk_ops = {
2503        .init = tcp_sk_init,
2504        .exit = tcp_sk_exit,
2505 };
2506
2507 void __init tcp_v4_init(void)
2508 {
2509         if (register_pernet_device(&tcp_sk_ops))
2510                 panic("Failed to create the TCP control socket.\n");
2511 }
2512
2513 EXPORT_SYMBOL(ipv4_specific);
2514 EXPORT_SYMBOL(tcp_hashinfo);
2515 EXPORT_SYMBOL(tcp_prot);
2516 EXPORT_SYMBOL(tcp_v4_conn_request);
2517 EXPORT_SYMBOL(tcp_v4_connect);
2518 EXPORT_SYMBOL(tcp_v4_do_rcv);
2519 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2520 EXPORT_SYMBOL(tcp_v4_send_check);
2521 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2522
2523 #ifdef CONFIG_PROC_FS
2524 EXPORT_SYMBOL(tcp_proc_register);
2525 EXPORT_SYMBOL(tcp_proc_unregister);
2526 #endif
2527 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2528