X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_input.c;h=4c10d9cad20f594e2bd6a3a15b0c6d45554ece38;hb=b76892051cf1c04d95872838e70146f65e3b9d75;hp=b11bd162422725777e9116e9288e25f15fadb538;hpb=005903bc3a0e8473fef809e8775db52dcd3cde63;p=linux-2.6-omap-h63xx.git diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b11bd162422..4c10d9cad20 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; @@ -104,6 +104,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -111,10 +112,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) -#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0) -#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) -#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) - #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -860,6 +857,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) } } +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok &= ~2; +} + +/* Take a notice that peer is sending DSACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok |= 4; +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -881,7 +893,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } @@ -943,9 +955,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); - else if (IsReno(tp)) + else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); - else if (IsFack(tp)) + else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); @@ -957,8 +969,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - /* Disable FACK yet. */ - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); } } @@ -1009,7 +1020,86 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * ---------------------- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * <- outs wnd -> <- wrapzone -> + * u e n u_w e_w s n_w + * | | | | | | | + * |<------------+------+----- TCP seqno space --------------+---------->| + * ...-- <2^31 ->| |<--------... + * ...---- >2^31 ------>| |<--------... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap -> snd_una region will then become "well defined", i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp->max_window can + * be used as an exaggerated estimate. */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp->snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. + * start_seq == snd_una is non-sensical (see comments above) + */ + if (after(start_seq, tp->snd_una)) + return 1; + + if (!is_dsack || !tp->undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp->snd_una)) + return 0; + + if (!before(start_seq, tp->undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp->undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: + * start_seq < undo_marker and end_seq >= undo_marker. + */ + return !before(start_seq, end_seq - tp->max_window); +} + + static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1020,7 +1110,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; - tp->rx_opt.sack_ok |= 4; + tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq)); @@ -1029,7 +1119,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (!after(end_seq_0, end_seq_1) && !before(start_seq_0, start_seq_1)) { dup_sack = 1; - tp->rx_opt.sack_ok |= 4; + tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } } @@ -1151,6 +1241,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack && (i == first_sack_index)); + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp->undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + continue; + } + skb = cached_skb; fack_count = cached_fack_count; @@ -1326,7 +1427,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ continue; if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && - (IsFack(tp) || + (tcp_is_fack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * tp->mss_cache))) { @@ -1346,6 +1447,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } + tcp_verify_left_out(tp); + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); @@ -1492,6 +1595,8 @@ void tcp_enter_frto(struct sock *sk) tp->undo_retrans = 0; skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1524,7 +1629,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->lost_out = 0; tp->retrans_out = 0; - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { @@ -1541,6 +1646,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } @@ -1556,7 +1663,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->undo_marker = 0; tp->frto_counter = 0; tp->reordering = min_t(unsigned int, tp->reordering, @@ -1565,7 +1671,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1605,10 +1711,14 @@ void tcp_enter_loss(struct sock *sk, int how) tp->bytes_acked = 0; tcp_clear_retrans(tp); - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - if (!how) + if (!how) { + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ tp->undo_marker = tp->snd_una; + tcp_clear_retrans_hints_partial(tp); + } else { + tcp_clear_all_retrans_hints(tp); + } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1635,8 +1745,6 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); /* Abort FRTO algorithm if one is in progress */ tp->frto_counter = 0; - - clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1666,7 +1774,7 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1870,7 +1978,7 @@ static void tcp_update_scoreboard(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (IsFack(tp)) { + if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; @@ -1884,7 +1992,7 @@ static void tcp_update_scoreboard(struct sock *sk) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk)) { + if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1936,7 +2044,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) int decr = tp->snd_cwnd_cnt + 1; if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || - (IsReno(tp) && !(flag&FLAG_NOT_DUP))) { + (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) { tp->snd_cwnd_cnt = decr&1; decr >>= 1; @@ -2001,7 +2109,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) /* There is something screwy going on with the retrans hints after an undo */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -2027,7 +2135,7 @@ static int tcp_try_undo_recovery(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); tp->undo_marker = 0; } - if (tp->snd_una == tp->high_seq && IsReno(tp)) { + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ @@ -2057,7 +2165,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2094,7 +2202,7 @@ static int tcp_try_undo_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); DBGUNDO(sk, "partial loss"); tp->lost_out = 0; @@ -2102,7 +2210,7 @@ static int tcp_try_undo_loss(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; - if (!IsReno(tp)) + if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return 1; } @@ -2121,6 +2229,8 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); + tcp_verify_left_out(tp); + if (tp->retrans_out == 0) tp->retrans_stamp = 0; @@ -2130,7 +2240,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp->sacked_out || tp->retrans_out || tp->undo_marker) + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { @@ -2183,7 +2293,7 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * tcp_xmit_retransmit_queue(). */ static void -tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) +tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2195,8 +2305,8 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) * 1. Reno does not count dupacks (sacked_out) automatically. */ if (!tp->packets_out) tp->sacked_out = 0; - /* 2. SACK counts snd_fack in packets inaccurately. */ - if (tp->sacked_out == 0) + + if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. @@ -2247,14 +2357,14 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. */ - IsReno(tp) || tp->snd_una != tp->high_seq) { + tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; @@ -2267,14 +2377,10 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { - if (IsReno(tp) && is_dupack) + if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else { - int acked = prior_packets - tp->packets_out; - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, acked); - do_lost = tcp_try_undo_partial(sk, acked); - } + } else + do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) @@ -2288,7 +2394,7 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) return; /* Loss is undone; fall through to processing in Open state. */ default: - if (IsReno(tp)) { + if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) @@ -2316,7 +2422,7 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) /* Otherwise enter Recovery state */ - if (IsReno(tp)) + if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); @@ -2414,8 +2520,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk) +static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2426,158 +2531,143 @@ static void tcp_ack_packets_out(struct sock *sk) } } -static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, - __u32 now, __s32 *seq_rtt) +/* If we get here, the whole TSO packet has not been acked. */ +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u32 seq = tp->snd_una; - __u32 packets_acked; - int acked = 0; + u32 packets_acked; - /* If we get here, the whole TSO packet has not been - * acked. - */ - BUG_ON(!after(scb->end_seq, seq)); + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); - if (tcp_trim_head(sk, skb, seq - scb->seq)) + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { - __u8 sacked = scb->sacked; - - acked |= FLAG_DATA_ACKED; - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; - *seq_rtt = -1; - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(seq, tp->snd_up)) - tp->urg_mode = 0; - } - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - - if (tp->fackets_out) { - __u32 dval = min(tp->fackets_out, packets_acked); - tp->fackets_out -= dval; - } - /* hint's skb might be NULL but we don't need to care */ - tp->fastpath_cnt_hint -= min_t(u32, packets_acked, - tp->fastpath_cnt_hint); - tp->packets_out -= packets_acked; - BUG_ON(tcp_skb_pcount(skb) == 0); - BUG_ON(!before(scb->seq, scb->end_seq)); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } - return acked; + return packets_acked; } -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +/* Remove acknowledged frames from the retransmission queue. If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. + */ +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - __u32 now = tcp_time_stamp; - int acked = 0; + u32 now = tcp_time_stamp; + int fully_acked = 1; + int flag = 0; int prior_packets = tp->packets_out; - __s32 seq_rtt = -1; + s32 seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); - while ((skb = tcp_write_queue_head(sk)) && - skb != tcp_send_head(sk)) { + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u8 sacked = scb->sacked; + u32 end_seq; + u32 packets_acked; + u8 sacked = scb->sacked; - /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived at - * the other end. - */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1 && - after(tp->snd_una, scb->seq)) - acked |= tcp_tso_acked(sk, skb, - now, &seq_rtt); - break; - } + if (tcp_skb_pcount(skb) == 1 || + !after(tp->snd_una, scb->seq)) + break; - /* Initial outgoing SYN's get put onto the write_queue - * just like anything else we transmit. It is not - * true data, and if we misinform our callers that - * this ACK acks real data, we will erroneously exit - * connection startup slow start one packet too - * quickly. This is severely frowned upon behavior. - */ - if (!(scb->flags & TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; + packets_acked = tcp_tso_acked(sk, skb); + if (!packets_acked) + break; + + fully_acked = 0; + end_seq = tp->snd_una; } else { - acked |= FLAG_SYN_ACKED; - tp->retrans_stamp = 0; + packets_acked = tcp_skb_pcount(skb); + end_seq = scb->end_seq; } /* MTU probing checks */ - if (icsk->icsk_mtup.probe_size) { - if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { - tcp_mtup_probe_success(sk, skb); - } + if (fully_acked && icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { + tcp_mtup_probe_success(sk, skb); } if (sacked) { if (sacked & TCPCB_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(skb); - acked |= FLAG_RETRANS_DATA_ACKED; + tp->retrans_out -= packets_acked; + flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || + (packets_acked > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - last_ackt = skb->tstamp; + if (fully_acked) + last_ackt = skb->tstamp; } + if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= tcp_skb_pcount(skb); + tp->sacked_out -= packets_acked; if (sacked & TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(skb); - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(scb->end_seq, tp->snd_up)) - tp->urg_mode = 0; - } + tp->lost_out -= packets_acked; + + if ((sacked & TCPCB_URG) && tp->urg_mode && + !before(end_seq, tp->snd_up)) + tp->urg_mode = 0; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - last_ackt = skb->tstamp; + if (fully_acked) + last_ackt = skb->tstamp; + } + tp->packets_out -= packets_acked; + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if (!(scb->flags & TCPCB_FLAG_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; } - tcp_dec_pcount_approx(&tp->fackets_out, skb); - tcp_packets_out_dec(tp, skb); + + if (!fully_acked) + break; + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } - if (acked&FLAG_ACKED) { + if (flag & FLAG_ACKED) { u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk); + tcp_ack_update_rtt(sk, flag, seq_rtt); + tcp_rearm_rto(sk); + + tp->fackets_out -= min(pkts_acked, tp->fackets_out); + /* hint's skb might be NULL but we don't need to care */ + tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, + tp->fastpath_cnt_hint); + if (tcp_is_reno(tp)) + tcp_remove_reno_sacks(sk, pkts_acked); if (ca_ops->pkts_acked) { s32 rtt_us = -1; /* Is the ACK triggering packet unambiguous? */ - if (!(acked & FLAG_RETRANS_DATA_ACKED)) { + if (!(flag & FLAG_RETRANS_DATA_ACKED)) { /* High resolution needed and available? */ if (ca_ops->flags & TCP_CONG_RTT_STAMP && !ktime_equal(last_ackt, @@ -2596,7 +2686,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); - if (!tp->packets_out && tp->rx_opt.sack_ok) { + if (!tp->packets_out && tcp_is_sack(tp)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", @@ -2616,7 +2706,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } #endif *seq_rtt_p = seq_rtt; - return acked; + return flag; } static void tcp_ack_probe(struct sock *sk) @@ -2771,12 +2861,16 @@ static int tcp_process_frto(struct sock *sk, int flag) if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || + ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) + tp->undo_marker = 0; + if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } - if (!IsSackFrto() || IsReno(tp)) { + if (!IsSackFrto() || tcp_is_reno(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate @@ -2835,6 +2929,7 @@ static int tcp_process_frto(struct sock *sk, int flag) break; } tp->frto_counter = 0; + tp->undo_marker = 0; } return 0; } @@ -2923,7 +3018,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight, 0); - tcp_fastretrans_alert(sk, prior_packets, flag); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight, 1); @@ -3260,7 +3355,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * Probably, we should reset in this case. For now drop them. */ __skb_queue_purge(&tp->out_of_order_queue); - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); @@ -3290,7 +3385,7 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_se static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); else @@ -3320,7 +3415,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -3636,7 +3731,7 @@ drop: if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ - if (tp->rx_opt.sack_ok) { + if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = 1; @@ -3701,7 +3796,7 @@ drop: } add_sack: - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); } } @@ -3890,7 +3985,7 @@ static int tcp_prune_queue(struct sock *sk) * is in a sad state like this, we care only about integrity * of the connection not performance. */ - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); } @@ -4591,8 +4686,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tp->rx_opt.sack_ok && sysctl_tcp_fack) - tp->rx_opt.sack_ok |= 2; + if (tcp_is_sack(tp) && sysctl_tcp_fack) + tcp_enable_fack(tp); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);