/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/module.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;
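/*
 * Illustrative sketch, not part of the original file: each sysctl above
 * is exported under /proc/sys/net/ipv4/ with the "sysctl_" prefix
 * dropped, so current values can be inspected (and, by root, tuned) from
 * userspace.  A minimal reader, assuming only those standard proc paths:
 */
#if 0	/* example only -- userspace */
#include <stdio.h>

static long read_tcp_sysctl(const char *name)
{
	char path[128];
	FILE *f;
	long val = -1;

	snprintf(path, sizeof(path), "/proc/sys/net/ipv4/%s", name);
	f = fopen(path, "r");
	if (f != NULL) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("tcp_syn_retries = %ld\n", read_tcp_sysctl("tcp_syn_retries"));
	printf("tcp_retries2    = %ld\n", read_tcp_sysctl("tcp_retries2"));
	return 0;
}
#endif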
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
}

EXPORT_SYMBOL(tcp_init_xmit_timers);
static void tcp_write_err(struct sock *sk)
{
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
	sk->sk_error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}
/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero-window probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively configured
 *    limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int orphans = atomic_read(&tcp_orphan_count);

	/* If peer does not open its window for a long time, or did not
	 * transmit anything for a long time, penalize it. */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;

	/* If some dubious ICMP arrived, penalize even more. */
	if (sk->sk_err_soft)
		orphans <<= 1;

	if (orphans >= sysctl_tcp_max_orphans ||
	    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases, when connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		return 1;
	}
	return 0;
}
/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (sk->sk_err_soft && !alive)
		retries = 0;

	/* However, if the socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with the minimal
	 * RTO of 200 msec. */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}
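/*
 * Illustrative sketch, not part of the original file: one way to read the
 * ">100 seconds" remark above.  With the minimal RTO of 200 msec doubling
 * on every attempt, the initial timeout plus 8 backed-off retries sum to
 * 0.2 * (2^9 - 1) = 102.2 seconds:
 */
#if 0	/* example only -- userspace */
#include <stdio.h>

int main(void)
{
	double rto = 0.2;	/* minimal RTO, in seconds */
	double total = 0.0;
	int i;

	for (i = 0; i <= 8; i++) {	/* initial timeout + 8 retries */
		total += rto;
		rto *= 2;
	}
	printf("total = %.1f s\n", total);	/* prints "total = 102.2 s" */
	return 0;
}
#endif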
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int retry_until;

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (icsk->icsk_retransmits)
			dst_negative_advice(&sk->sk_dst_cache);
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires PMTU black
			   hole detection.

			   This is the place to implement it. It is not implemented,
			   and I do not want to implement it. It is disgusting and
			   does not work in any case. Let me cite the same draft,
			   which requires us to implement this:

   "The one security concern raised by this memo is that ICMP black holes
   are often caused by over-zealous security administrators who block
   all ICMP messages.  It is vitally important that those who design and
   deploy security systems understand the impact of strict filtering on
   upper-layer protocols.  The safest web site in the world is worthless
   if most TCP implementations cannot transfer data from it.  It would
   be far nicer to have all of the black holes fixed rather than fixing
   all of the TCP implementations."

			 */
			dst_negative_advice(&sk->sk_dst_cache);
		}

		retry_until = sysctl_tcp_retries2;
		if (sock_flag(sk, SOCK_DEAD)) {
			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
				return 1;
		}
	}

	if (icsk->icsk_retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}
static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		icsk->icsk_ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
		goto out_unlock;
	}

	sk_stream_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		goto out;

	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
		goto out;
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->sk_backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}

out:
	if (tcp_memory_pressure)
		sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
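/*
 * Illustrative sketch, not part of the original file: the ATO adjustment
 * above in miniature.  When the delayed-ACK timer fires with an ACK still
 * pending, a non-pingpong socket inflates its ATO (doubling, clamped by
 * the RTO), while a pingpong (interactive) socket leaves pingpong mode
 * and deflates back to TCP_ATO_MIN:
 */
#if 0	/* example only */
static unsigned int next_ato(unsigned int ato, unsigned int rto, int *pingpong)
{
	if (!*pingpong)
		return (ato << 1) < rto ? (ato << 1) : rto;	/* inflate */
	*pingpong = 0;						/* leave pingpong mode */
	return TCP_ATO_MIN;					/* deflate */
}
#endif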
static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !sk->sk_send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer. -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs
	 * even if they advertise a zero window. Hence, the connection is
	 * killed only if we received no ACKs for the normal connection
	 * timeout. It is not killed merely because the window stays zero
	 * for some time; the window may stay zero until armageddon and
	 * even later. We are in full accordance with the RFCs; it is just
	 * that the probe timer combines both the retransmission timeout
	 * and the probe timeout in one bottle.			--ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}
/*
 *	The TCP retransmit timer.
 */

static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (!tp->packets_out)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));

	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* Receiver dastardly shrinks window. Our retransmits
		 * become zero probes, but we should not timeout this
		 * connection. If the socket is an orphan, time it out;
		 * we cannot allow such beasts to hang infinitely.
		 */
		if (net_ratelimit()) {
			struct inet_sock *inet = inet_sk(sk);
			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
			       NIPQUAD(inet->daddr), htons(inet->dport),
			       inet->num, tp->snd_una, tp->snd_nxt);
		}
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		__sk_dst_reset(sk);
		goto out_reset_timer;
	}

	if (tcp_write_timeout(sk))
		goto out;

	if (icsk->icsk_retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->rx_opt.sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}

	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}

	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
		/* Retransmission failed because of local congestion;
		 * do not back off.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
					  TCP_RTO_MAX);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}
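/*
 * Illustrative sketch, not part of the original file: the backoff applied
 * at out_reset_timer above, in isolation.  Each expiry doubles the RTO up
 * to TCP_RTO_MAX (the 120 second clamp the comment above complains about),
 * so starting from a 200 msec RTO the schedule runs 0.2 s, 0.4 s, 0.8 s,
 * ... 102.4 s and then flattens at 120 s:
 */
#if 0	/* example only -- userspace */
#include <stdio.h>

int main(void)
{
	unsigned long rto = 200;		/* initial RTO, msec */
	const unsigned long rto_max = 120000;	/* TCP_RTO_MAX, msec */
	int i;

	for (i = 1; i <= 15; i++) {
		printf("retransmit %2d scheduled %lu msec out\n", i, rto);
		/* same doubling-with-clamp as out_reset_timer */
		rto = (rto << 1) < rto_max ? (rto << 1) : rto_max;
	}
	return 0;
}
#endif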
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}

	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
		goto out;

	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
		goto out;
	}

	event = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	switch (event) {
	case ICSK_TIME_RETRANS:
		tcp_retransmit_timer(sk);
		break;
	case ICSK_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}

out:
	sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent,
		       const unsigned long interval, const unsigned long timeout,
		       const unsigned long max_rto, int max_retries)
{
	struct inet_connection_sock *icsk = inet_csk(parent);
	struct listen_sock *lopt = queue->listen_opt;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;

	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first timeout.
	 * If the synack was not acknowledged for 3 seconds, it means one
	 * of the following: the synack was lost, the ack was lost, the rtt
	 * is high, or nobody planned to ack (i.e. synflood).
	 * When the server is a bit loaded, the queue is populated with old
	 * open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. That is not a synflood,
	 * it is normal operation. The solution is to prune entries that
	 * are too old, overriding the normal timeout, when the situation
	 * becomes dangerous.
	 *
	 * Essentially, we reserve half of the room for young
	 * embryos, and abort old ones without pity if they
	 * are about to clog our table.
	 */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}

	if (queue->rskq_defer_accept)
		max_retries = queue->rskq_defer_accept;

	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
	i = lopt->clock_hand;

	do {
		reqp = &lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if (time_after_eq(now, req->expires)) {
				if ((req->retrans < thresh ||
				     (inet_rsk(req)->acked && req->retrans < max_retries))
				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((timeout << req->retrans), max_rto);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				inet_csk_reqsk_queue_unlink(parent, req, reqp);
				reqsk_queue_removed(&icsk->icsk_accept_queue, req);
				reqsk_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i + 1) & (lopt->nr_table_entries - 1);

	} while (--budget > 0);

	lopt->clock_hand = i;

	if (lopt->qlen)
		inet_csk_reset_keepalive_timer(parent, interval);
}

EXPORT_SYMBOL_GPL(reqsk_queue_prune);
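/*
 * Illustrative sketch, not part of the original file: the thresh/young
 * loop above on concrete numbers.  The more the table fills up with old
 * embryos relative to young ones, the fewer SYNACK retransmissions an
 * old entry is allowed before being dropped (never fewer than 2):
 */
#if 0	/* example only */
static int prune_thresh(int max_retries, int qlen, int qlen_young)
{
	int thresh = max_retries;
	int young = qlen_young << 1;

	while (thresh > 2) {
		if (qlen < young)
			break;
		thresh--;
		young <<= 1;
	}
	return thresh;	/* e.g. prune_thresh(5, 700, 100) == 3 */
}
#endif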
/*
 *	Timer for listening sockets
 */

static void tcp_synack_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;

	reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL,
			  TCP_TIMEOUT_INIT, TCP_RTO_MAX, max_retries);
}
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
	else if (!val)
		inet_csk_delete_keepalive_timer(sk);
}
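/*
 * Illustrative userspace sketch, not part of the original file: this
 * function runs when an application toggles SO_KEEPALIVE, and the
 * per-socket overrides consulted by tcp_keepalive_timer() below can be
 * set with the TCP_KEEPIDLE/TCP_KEEPINTVL/TCP_KEEPCNT socket options:
 */
#if 0	/* example only -- userspace */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int enable_keepalive(int fd)
{
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
		return -1;
	/* first probe after 60 s idle, then every 10 s, abort after 5 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0)
		return -1;
	return 0;
}
#endif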
static void tcp_keepalive_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		inet_csk_reset_keepalive_timer(sk, HZ/20);
		goto out;
	}

	if (sk->sk_state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}

	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (tp->linger2 >= 0) {
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}

	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || sk->sk_send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;

	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* The timer fires at tp->rcv_tstamp + keepalive_time_when(tp). */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

	sk_stream_mem_reclaim(sk);

resched:
	inet_csk_reset_keepalive_timer(sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
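/*
 * Illustrative sketch, not part of the original file: the rearm
 * arithmetic above.  While the connection has been idle for less than
 * the keepalive period, the timer is set for the remainder, so it fires
 * at rcv_tstamp + period; once the period has elapsed, successive probes
 * are spaced by the (usually much shorter) probe interval:
 */
#if 0	/* example only */
static unsigned int keepalive_rearm(unsigned int idle, unsigned int period,
				    unsigned int intvl)
{
	if (idle < period)
		return period - idle;	/* wait out the rest of the period */
	return intvl;			/* between successive probes */
}
#endif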