dccp ccid-3: Always perform receiver RTT sampling

[linux-2.6-omap-h63xx.git] / net / dccp / ccids / ccid2.c
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c

index c7d83e3c1648ceaf63d49d9867f554cbda042fa7..fa713227c66f10bb6be9518d0accbecea2a28257 100644 (file)
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -34,51 +34,8 @@
  #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
  static int ccid2_debug;
  #define ccid2_pr_debug(format, a...)   DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
-{
-       int len = 0;
-       int pipe = 0;
-       struct ccid2_seq *seqp = hctx->seqh;
-
-       /* there is data in the chain */
-       if (seqp != hctx->seqt) {
-               seqp = seqp->ccid2s_prev;
-               len++;
-               if (!seqp->ccid2s_acked)
-                       pipe++;
-
-               while (seqp != hctx->seqt) {
-                       struct ccid2_seq *prev = seqp->ccid2s_prev;
-
-                       len++;
-                       if (!prev->ccid2s_acked)
-                               pipe++;
-
-                       /* packets are sent sequentially */
-                       BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
-                                               prev->ccid2s_seq ) >= 0);
-                       BUG_ON(time_before(seqp->ccid2s_sent,
-                                          prev->ccid2s_sent));
-
-                       seqp = prev;
-               }
-       }
-
-       BUG_ON(pipe != hctx->pipe);
-       ccid2_pr_debug("len of chain=%d\n", len);
-
-       do {
-               seqp = seqp->ccid2s_prev;
-               len++;
-       } while (seqp != hctx->seqh);
-
-       ccid2_pr_debug("total len=%d\n", len);
-       BUG_ON(len != hctx->seqbufc * CCID2_SEQBUF_LEN);
-}
  #else
  #define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hctx)
  #endif
  
  static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx)
@@ -153,20 +110,11 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
         dp->dccps_l_ack_ratio = val;
  }
  
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val)
-{
-       ccid2_pr_debug("change SRTT to %ld\n", val);
-       hctx->srtt = val;
-}
-
-static void ccid2_start_rto_timer(struct sock *sk);
-
  static void ccid2_hc_tx_rto_expire(unsigned long data)
  {
         struct sock *sk = (struct sock *)data;
         struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
         const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx);
-       long s;
  
         bh_lock_sock(sk);
         if (sock_owned_by_user(sk)) {
@@ -176,14 +124,10 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
  
         ccid2_pr_debug("RTO_EXPIRE\n");
  
-       ccid2_hc_tx_check_sanity(hctx);
-
         /* back-off timer */
         hctx->rto <<= 1;
-
-       s = hctx->rto / HZ;
-       if (s > 60)
-               hctx->rto = 60 * HZ;
+       if (hctx->rto > DCCP_RTO_MAX)
+               hctx->rto = DCCP_RTO_MAX;
  
         /* adjust pipe, cwnd etc */
         hctx->ssthresh = hctx->cwnd / 2;
@@ -200,28 +144,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
         hctx->rpseq    = 0;
         hctx->rpdupack = -1;
         ccid2_change_l_ack_ratio(sk, 1);
-       ccid2_hc_tx_check_sanity(hctx);
  
         /* if we were blocked before, we may now send cwnd=1 packet */
         if (sender_was_blocked)
                 tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
-       ccid2_start_rto_timer(sk);
+       /* restart backed-off timer */
+       sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
  out:
         bh_unlock_sock(sk);
         sock_put(sk);
  }
  
-static void ccid2_start_rto_timer(struct sock *sk)
-{
-       struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-       ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->rto);
-
-       BUG_ON(timer_pending(&hctx->rtotimer));
-       sk_reset_timer(sk, &hctx->rtotimer,
-                      jiffies + hctx->rto);
-}
-
  static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
  {
         struct dccp_sock *dp = dccp_sk(sk);
@@ -300,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
  
         /* setup RTO timer */
         if (!timer_pending(&hctx->rtotimer))
-               ccid2_start_rto_timer(sk);
+               sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
  
  #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
         do {
@@ -314,21 +247,90 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
                 }
         } while (0);
         ccid2_pr_debug("=========\n");
-       ccid2_hc_tx_check_sanity(hctx);
  #endif
  }
  
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical to TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ *   which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
  {
         struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+       long m = mrtt ? : 1;
  
-       sk_stop_timer(sk, &hctx->rtotimer);
-       ccid2_pr_debug("deleted RTO timer\n");
+       if (hctx->srtt == 0) {
+               /* First measurement m */
+               hctx->srtt = m << 3;
+               hctx->mdev = m << 1;
+
+               hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev);
+               hctx->rttvar   = hctx->mdev_max;
+               hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
+       } else {
+               /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+               m -= (hctx->srtt >> 3);
+               hctx->srtt += m;
+
+               /* Similarly, update scaled mdev with regard to |m| */
+               if (m < 0) {
+                       m = -m;
+                       m -= (hctx->mdev >> 2);
+                       /*
+                        * This neutralises RTO increase when RTT < SRTT - mdev
+                        * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+                        * in Linux TCP", USENIX 2002, pp. 49-62).
+                        */
+                       if (m > 0)
+                               m >>= 3;
+               } else {
+                       m -= (hctx->mdev >> 2);
+               }
+               hctx->mdev += m;
+
+               if (hctx->mdev > hctx->mdev_max) {
+                       hctx->mdev_max = hctx->mdev;
+                       if (hctx->mdev_max > hctx->rttvar)
+                               hctx->rttvar = hctx->mdev_max;
+               }
+
+               /*
+                * Decay RTTVAR at most once per flight, exploiting that
+                *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
+                *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
+                * GAR is a useful bound for FlightSize = pipe, AWL is probably
+                * too low as it over-estimates pipe.
+                */
+               if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) {
+                       if (hctx->mdev_max < hctx->rttvar)
+                               hctx->rttvar -= (hctx->rttvar -
+                                                hctx->mdev_max) >> 2;
+                       hctx->rtt_seq  = dccp_sk(sk)->dccps_gss;
+                       hctx->mdev_max = TCP_RTO_MIN;
+               }
+       }
+
+       /*
+        * Set RTO from SRTT and RTTVAR
+        * Clock granularity is ignored since the minimum error for RTTVAR is
+        * clamped to 50msec (corresponding to HZ=20). This leads to a minimum
+        * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP
+        * does not retransmit data, DCCP does not require TCP's recommended
+        * minimum timeout of one second".
+        */
+       hctx->rto = (hctx->srtt >> 3) + hctx->rttvar;
+
+       if (hctx->rto > DCCP_RTO_MAX)
+               hctx->rto = DCCP_RTO_MAX;
  }
  
-static inline void ccid2_new_ack(struct sock *sk,
-                                struct ccid2_seq *seqp,
-                                unsigned int *maxincr)
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+                         unsigned int *maxincr)
  {
         struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
  
@@ -342,81 +344,15 @@ static inline void ccid2_new_ack(struct sock *sk,
                         hctx->cwnd += 1;
                         hctx->packets_acked = 0;
         }
-
-       /* update RTO */
-       if (hctx->srtt == -1 ||
-           time_after(jiffies, hctx->lastrtt + hctx->srtt)) {
-               unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
-               int s;
-
-               /* first measurement */
-               if (hctx->srtt == -1) {
-                       ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
-                                      r, jiffies,
-                                      (unsigned long long)seqp->ccid2s_seq);
-                       ccid2_change_srtt(hctx, r);
-                       hctx->rttvar = r >> 1;
-               } else {
-                       /* RTTVAR */
-                       long tmp = hctx->srtt - r;
-                       long srtt;
-
-                       if (tmp < 0)
-                               tmp *= -1;
-
-                       tmp >>= 2;
-                       hctx->rttvar *= 3;
-                       hctx->rttvar >>= 2;
-                       hctx->rttvar += tmp;
-
-                       /* SRTT */
-                       srtt = hctx->srtt;
-                       srtt *= 7;
-                       srtt >>= 3;
-                       tmp = r >> 3;
-                       srtt += tmp;
-                       ccid2_change_srtt(hctx, srtt);
-               }
-               s = hctx->rttvar << 2;
-               /* clock granularity is 1 when based on jiffies */
-               if (!s)
-                       s = 1;
-               hctx->rto = hctx->srtt + s;
-
-               /* must be at least a second */
-               s = hctx->rto / HZ;
-               /* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
-               if (s < 1)
-                       hctx->rto = HZ;
-#endif
-               /* max 60 seconds */
-               if (s > 60)
-                       hctx->rto = HZ * 60;
-
-               hctx->lastrtt = jiffies;
-
-               ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
-                              hctx->srtt, hctx->rttvar,
-                              hctx->rto, HZ, r);
-       }
-
-       /* we got a new ack, so re-start RTO timer */
-       ccid2_hc_tx_kill_rto_timer(sk);
-       ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
-       struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
-
-       if (hctx->pipe == 0)
-               DCCP_BUG("pipe == 0");
-       else
-               hctx->pipe--;
-
-       if (hctx->pipe == 0)
-               ccid2_hc_tx_kill_rto_timer(sk);
+       /*
+        * FIXME: RTT is sampled several times per acknowledgment (for each
+        * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+        * This causes the RTT to be over-estimated, since the older entries
+        * in the Ack Vector have earlier sending times.
+        * The cleanest solution is to not use the ccid2s_sent field at all
+        * and instead use DCCP timestamps - this must be resolved eventually.
+        */
+       ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent);
  }
  
  static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -463,7 +399,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
         int done = 0;
         unsigned int maxincr = 0;
  
-       ccid2_hc_tx_check_sanity(hctx);
         /* check reverse path congestion */
         seqno = DCCP_SKB_CB(skb)->dccpd_seq;
  
@@ -566,7 +501,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
                                         seqp->ccid2s_acked = 1;
                                         ccid2_pr_debug("Got ack for %llu\n",
                                                        (unsigned long long)seqp->ccid2s_seq);
-                                       ccid2_hc_tx_dec_pipe(sk);
+                                       hctx->pipe--;
                                 }
                                 if (seqp == hctx->seqt) {
                                         done = 1;
@@ -622,7 +557,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
                                  * one ack vector.
                                  */
                                 ccid2_congestion_event(sk, seqp);
-                               ccid2_hc_tx_dec_pipe(sk);
+                               hctx->pipe--;
                         }
                         if (seqp == hctx->seqt)
                                 break;
@@ -640,7 +575,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
                 hctx->seqt = hctx->seqt->ccid2s_next;
         }
  
-       ccid2_hc_tx_check_sanity(hctx);
+       /* restart RTO timer if not all outstanding data has been acked */
+       if (hctx->pipe == 0)
+               sk_stop_timer(sk, &hctx->rtotimer);
+       else
+               sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto);
  done:
         /* check if incoming Acks allow pending packets to be sent */
         if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx))
@@ -657,12 +596,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
         /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
         hctx->ssthresh = ~0U;
  
-       /*
-        * RFC 4341, 5: "The cwnd parameter is initialized to at most four
-        * packets for new connections, following the rules from [RFC3390]".
-        * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
-        */
-       hctx->cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+       /* Use larger initial windows (RFC 3390, rfc2581bis) */
+       hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
  
         /* Make sure that Ack Ratio is enabled and within bounds. */
         max_ratio = DIV_ROUND_UP(hctx->cwnd, 2);
@@ -673,15 +608,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
         if (ccid2_hc_tx_alloc_seq(hctx))
                 return -ENOMEM;
  
-       hctx->rto        = 3 * HZ;
-       ccid2_change_srtt(hctx, -1);
-       hctx->rttvar    = -1;
+       hctx->rto       = DCCP_TIMEOUT_INIT;
         hctx->rpdupack  = -1;
         hctx->last_cong = jiffies;
         setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk);
         INIT_LIST_HEAD(&hctx->av_chunks);
-
-       ccid2_hc_tx_check_sanity(hctx);
         return 0;
  }
  
@@ -690,7 +621,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
         struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
         int i;
  
-       ccid2_hc_tx_kill_rto_timer(sk);
+       sk_stop_timer(sk, &hctx->rtotimer);
  
         for (i = 0; i < hctx->seqbufc; i++)
                 kfree(hctx->seqbuf[i]);