#define IsSackFrto() (sysctl_tcp_frto == 0x2)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
+/* Set TCP_ECN_QUEUE_CWR in tp->ecn_flags; does nothing unless the
+ * TCP_ECN_OK bit is already set there.
+ */
+static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+{
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		return;
+
+	tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+/* Clear TCP_ECN_DEMAND_CWR in tp->ecn_flags when the TCP header of skb
+ * carries the cwr bit; otherwise leave the flags untouched.
+ */
+static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if (!tcp_hdr(skb)->cwr)
+		return;
+
+	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Unconditionally clear TCP_ECN_DEMAND_CWR in tp->ecn_flags. */
+static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+{
+	tp->ecn_flags = tp->ecn_flags & ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Examine the ECN information stored in TCP_SKB_CB(skb)->flags.
+ *
+ * Only acts when TCP_ECN_OK is set in tp->ecn_flags:
+ *  - a CE mark sets TCP_ECN_DEMAND_CWR;
+ *  - otherwise a segment without ECT switches the socket to quickack mode.
+ */
+static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		return;
+
+	if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) {
+		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		return;
+	}
+
+	/* Funny extension: a segment that does not carry ECT is surely a
+	 * retransmit. The ECN RFC does not say so, but Linux follows
+	 * this rule.
+	 */
+	if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
+		tcp_enter_quickack_mode((struct sock *)tp);
+}
+
+/* Handle the ECN bits of a received SYN-ACK header: TCP_ECN_OK stays set
+ * only if th has ece set and cwr clear; in every other case it is cleared.
+ */
+static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
+{
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		return;
+
+	if (!th->ece || th->cwr)
+		tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+/* Handle the ECN bits of a received SYN header: TCP_ECN_OK stays set only
+ * if th has both ece and cwr set; otherwise it is cleared.
+ */
+static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
+{
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		return;
+
+	if (!(th->ece && th->cwr))
+		tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+/* Return 1 when th has ece set, is not a SYN, and TCP_ECN_OK is set in
+ * tp->ecn_flags; return 0 otherwise.
+ */
+static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
+{
+	return (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) ? 1 : 0;
+}
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
tcp_grow_window(sk, skb);
}
+/* Pick the minimum RTO for this socket.
+ *
+ * Returns the RTAX_RTO_MIN metric of the socket's dst entry when such an
+ * entry exists and that metric is locked; otherwise returns TCP_RTO_MIN.
+ */
+static u32 tcp_rto_min(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (!dst || !dst_metric_locked(dst, RTAX_RTO_MIN))
+		return TCP_RTO_MIN;
+
+	return dst->metrics[RTAX_RTO_MIN-1];
+}
+
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge
if (tp->mdev_max < tp->rttvar)
tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
tp->rtt_seq = tp->snd_nxt;
- tp->mdev_max = TCP_RTO_MIN;
+ tp->mdev_max = tcp_rto_min(sk);
}
} else {
/* no previous measure. */
tp->srtt = m<<3; /* take the measured time to be rtt */
tp->mdev = m<<1; /* make sure rto = 3*rtt */
- tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
tp->rtt_seq = tp->snd_nxt;
}
}
}
}
-/* Numbers are taken from RFC2414. */
+/* Numbers are taken from RFC3390.
+ *
+ * John Heffner states:
+ *
+ * The RFC specifies a window of no more than 4380 bytes
+ * unless 2*MSS > 4380. Reading the pseudocode in the RFC
+ * is a bit misleading because they use a clamp at 4380 bytes
+ * rather than use a multiplier in the relevant range.
+ */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
* Both of these heuristics are not used in Loss state, when we cannot
* account for retransmits accurately.
*/
+/* Decide whether the first SACK block of an ACK is a duplicate SACK.
+ *
+ * @tp:            socket state; rx_opt.sack_ok and undo_retrans may change
+ * @ack_skb:       the ACK segment; only its ack_seq is consulted
+ * @sp:            SACK blocks in wire (network byte order) format
+ * @num_sacks:     number of blocks available in @sp
+ * @prior_snd_una: value compared against the end of the first block
+ *
+ * The first block counts as a D-SACK when it starts before ack_seq, or,
+ * if there is more than one block, when it lies entirely inside the
+ * second block. On a hit, bit 4 is OR-ed into rx_opt.sack_ok and the
+ * matching MIB counter is bumped.
+ *
+ * Returns 1 when a D-SACK was found, 0 otherwise.
+ */
+static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
+			   struct tcp_sack_block_wire *sp, int num_sacks,
+			   u32 prior_snd_una)
+{
+	u32 first_start = ntohl(get_unaligned(&sp[0].start_seq));
+	u32 first_end = ntohl(get_unaligned(&sp[0].end_seq));
+	int found = 0;
+
+	if (before(first_start, TCP_SKB_CB(ack_skb)->ack_seq)) {
+		found = 1;
+		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
+	} else if (num_sacks > 1) {
+		u32 second_end = ntohl(get_unaligned(&sp[1].end_seq));
+		u32 second_start = ntohl(get_unaligned(&sp[1].start_seq));
+
+		if (!after(first_end, second_end) &&
+		    !before(first_start, second_start)) {
+			found = 1;
+			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
+		}
+	}
+
+	if (!found)
+		return 0;
+
+	tp->rx_opt.sack_ok |= 4;
+
+	/* D-SACK for already forgotten data... Do dumb counting. */
+	if (!after(first_end, prior_snd_una) &&
+	    after(first_end, tp->undo_marker))
+		tp->undo_retrans--;
+
+	return 1;
+}
+
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
int i;
int first_sack_index;
- if (!tp->sacked_out)
+ if (!tp->sacked_out) {
tp->fackets_out = 0;
+ tp->highest_sack = tp->snd_una;
+ }
prior_fackets = tp->fackets_out;
- /* Check for D-SACK. */
- if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)->ack_seq)) {
+ found_dup_sack = tcp_check_dsack(tp, ack_skb, sp,
+ num_sacks, prior_snd_una);
+ if (found_dup_sack)
flag |= FLAG_DSACKING_ACK;
- found_dup_sack = 1;
- tp->rx_opt.sack_ok |= 4;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
- } else if (num_sacks > 1 &&
- !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) &&
- !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) {
- flag |= FLAG_DSACKING_ACK;
- found_dup_sack = 1;
- tp->rx_opt.sack_ok |= 4;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
- }
-
- /* D-SACK for already forgotten data...
- * Do dumb counting. */
- if (found_dup_sack &&
- !after(ntohl(sp[0].end_seq), prior_snd_una) &&
- after(ntohl(sp[0].end_seq), tp->undo_marker))
- tp->undo_retrans--;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
+
+ if (after(TCP_SKB_CB(skb)->seq,
+ tp->highest_sack))
+ tp->highest_sack = TCP_SKB_CB(skb)->seq;
} else {
if (dup_sack && (sacked&TCPCB_RETRANS))
reord = min(fack_count, reord);
}
}
- tp->left_out = tp->sacked_out + tp->lost_out;
-
if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
(!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
+/* Keep the emulated SACK count consistent with packets_out.
+ *
+ * The number of holes is taken as max(lost_out, 1), capped at packets_out.
+ * If sacked_out plus the holes would exceed packets_out, sacked_out is
+ * trimmed to fit and tcp_update_reordering() is called with
+ * packets_out + @addend.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 gap_cnt = max(tp->lost_out, 1U);
+
+	gap_cnt = min(gap_cnt, tp->packets_out);
+
+	if ((tp->sacked_out + gap_cnt) <= tp->packets_out)
+		return;
+
+	tp->sacked_out = tp->packets_out - gap_cnt;
+	tcp_update_reordering(sk, tp->packets_out + addend, 0);
+}
+
+/* SACK emulation for a SACKless connection: record one more dupack by
+ * bumping sacked_out, then re-check reordering and the left-out counters.
+ */
+static void tcp_add_reno_sack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->sacked_out += 1;
+	tcp_check_reno_reordering(sk, 0);
+	tcp_verify_left_out(tp);
+}
+
+/* Account for an ACK that acknowledges data during Reno Recovery.
+ *
+ * @acked: number of acknowledged segments. When positive, the first one is
+ *         treated as filling the hole and the remaining acked - 1 are
+ *         subtracted from sacked_out, which is floored at zero.
+ *
+ * Reordering and the left-out counters are re-checked in every case.
+ */
+static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (acked > 0) {
+		/* One ACK acked hole. The rest eat duplicate ACKs. */
+		if (acked-1 < tp->sacked_out)
+			tp->sacked_out -= acked-1;
+		else
+			tp->sacked_out = 0;
+	}
+
+	tcp_check_reno_reordering(sk, acked);
+	tcp_verify_left_out(tp);
+}
+
+/* Forget all emulated SACKs by zeroing tp->sacked_out. */
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+	tp->sacked_out = 0;
+}
+
int tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
/* Earlier loss recovery underway (see RFC4138; Appendix B).
* The last condition is necessary at least in tp->frto_counter case.
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt = 0;
- tp->sacked_out = 0;
tp->lost_out = 0;
- tp->fackets_out = 0;
tp->retrans_out = 0;
+ if (IsReno(tp))
+ tcp_reset_reno_sack(tp);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
- cnt += tcp_skb_pcount(skb);
/*
* Count the retransmission made on RTO correctly (only when
* waiting for the first ACK and did not get it)...
} else {
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
}
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
- /* Do not mark those segments lost that were
- * forward transmitted after RTO
- */
- if (!after(TCP_SKB_CB(skb)->end_seq,
- tp->frto_highmark)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- }
- } else {
- tp->sacked_out += tcp_skb_pcount(skb);
- tp->fackets_out = cnt;
+ /* Don't lost mark skbs that were fwd transmitted after RTO */
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) &&
+ !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
}
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
tp->snd_cwnd_cnt = 0;
void tcp_clear_retrans(struct tcp_sock *tp)
{
- tp->left_out = 0;
tp->retrans_out = 0;
tp->fackets_out = 0;
tp->fackets_out = cnt;
}
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
return 0;
}
-/* If we receive more dupacks than we expected counting segments
- * in assumption of absent reordering, interpret this as reordering.
- * The only another reason could be bug in receiver TCP.
+/* RFC: This is from the original, I doubt that this is necessary at all:
+ * clear the xmit_retrans hint if the seq of this skb is before the hint's
+ * seq. How could we have retransmitted past LOST markings in the first
+ * place? I'm not fully sure about undo and end of connection cases, which
+ * can cause R without L?
*/
-static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
+ struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 holes;
-
- holes = max(tp->lost_out, 1U);
- holes = min(holes, tp->packets_out);
-
- if ((tp->sacked_out + holes) > tp->packets_out) {
- tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(sk, tp->packets_out + addend, 0);
- }
-}
-
-/* Emulate SACKs for SACKless connection: account for a new dupack. */
-
-static void tcp_add_reno_sack(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- tcp_sync_left_out(tp);
-}
-
-/* Account for ACK, ACKing some data in Reno Recovery phase. */
-
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (acked > 0) {
- /* One ACK acked hole. The rest eat duplicate ACKs. */
- if (acked-1 >= tp->sacked_out)
- tp->sacked_out = 0;
- else
- tp->sacked_out -= acked-1;
- }
- tcp_check_reno_reordering(sk, acked);
- tcp_sync_left_out(tp);
-}
-
-static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
-{
- tp->sacked_out = 0;
- tp->left_out = tp->lost_out;
+ if ((tp->retransmit_skb_hint != NULL) &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ tp->retransmit_skb_hint = NULL;
}
/* Mark head of queue up as lost. */
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-
- /* clear xmit_retransmit_queue hints
- * if this is beyond hint */
- if (tp->retransmit_skb_hint != NULL &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
- tp->retransmit_skb_hint = NULL;
-
+ tcp_verify_retransmit_hint(tp, skb);
}
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
}
/* Account newly detected lost packet(s) */
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-
- /* clear xmit_retrans hint */
- if (tp->retransmit_skb_hint &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
-
- tp->retransmit_skb_hint = NULL;
+ tcp_verify_retransmit_hint(tp, skb);
}
}
tp->scoreboard_skb_hint = skb;
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
}
}
printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
msg,
NIPQUAD(inet->daddr), ntohs(inet->dport),
- tp->snd_cwnd, tp->left_out,
+ tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
DBGUNDO(sk, "partial loss");
tp->lost_out = 0;
- tp->left_out = tp->sacked_out;
tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
inet_csk(sk)->icsk_retransmits = 0;
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_sync_left_out(tp);
-
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
- if (tp->left_out || tp->retrans_out || tp->undo_marker)
+ if (tp->sacked_out || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
}
- /* D. Synchronize left_out to current state. */
- tcp_sync_left_out(tp);
+ /* D. Check consistency of the current state. */
+ tcp_verify_left_out(tp);
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
__u32 dval = min(tp->fackets_out, packets_acked);
tp->fackets_out -= dval;
}
+ /* hint's skb might be NULL but we don't need to care */
+ tp->fastpath_cnt_hint -= min_t(u32, packets_acked,
+ tp->fastpath_cnt_hint);
tp->packets_out -= packets_acked;
BUG_ON(tcp_skb_pcount(skb) == 0);
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
/* Duplicate the behavior from Loss state (fastretrans_alert) */
if (flag&FLAG_DATA_ACKED)