From 375fe02c91792917aa26d68a87ab110d1937f44e Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Mon, 22 Jul 2013 16:20:45 -0700
Subject: [PATCH 1/4] tcp: consolidate SYNACK RTT sampling

This first patch consolidates SYNACK and other RTT measurements into a
central function, tcp_ack_update_rtt(). A (small) bonus is that SYNACK
RTT measurement now happens after the PAWS check, potentially reducing
the impact of bad TCP timestamp values on RTO seeding.

Signed-off-by: Yuchung Cheng
Signed-off-by: David S. Miller
---
 include/net/tcp.h        |  9 ---------
 net/ipv4/tcp_input.c     | 14 ++++++++++++--
 net/ipv4/tcp_ipv4.c      |  2 --
 net/ipv4/tcp_minisocks.c |  8 ++------
 net/ipv6/tcp_ipv6.c      |  2 --
 5 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d1980054ec75..f9777dbede75 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1094,15 +1094,6 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->loc_port = tcp_hdr(skb)->dest;
 }
 
-/* Compute time elapsed between SYNACK and the ACK completing 3WHS */
-static inline void tcp_synack_rtt_meas(struct sock *sk,
-				       struct request_sock *req)
-{
-	if (tcp_rsk(req)->snt_synack)
-		tcp_valid_rtt_meas(sk,
-				   tcp_time_stamp - tcp_rsk(req)->snt_synack);
-}
-
 extern void tcp_enter_memory_pressure(struct sock *sk);
 
 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28af45abe062..b531710596ec 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2853,6 +2853,17 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 		tcp_ack_no_tstamp(sk, seq_rtt, flag);
 }
 
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 seq_rtt = -1;
+
+	if (tp->lsndtime && !tp->total_retrans)
+		seq_rtt = tcp_time_stamp - tp->lsndtime;
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt);
+}
+
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -5624,9 +5635,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		 * so release it.
 		 */
 		if (req) {
-			tcp_synack_rtt_meas(sk, req);
 			tp->total_retrans = req->num_retrans;
-
 			reqsk_fastopen_remove(sk, req, false);
 		} else {
 			/* Make sure socket is routed, for correct metrics.
 			 */
@@ -5651,6 +5660,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+		tcp_synack_rtt_meas(sk, req);
 
 		if (tp->rx_opt.tstamp_ok)
 			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b299da5ff499..2e3f129df0eb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1671,8 +1671,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	tcp_synack_rtt_meas(newsk, req);
-	newtp->total_retrans = req->num_retrans;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ab1c08658528..58a3e69aef64 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -411,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 		tcp_enable_early_retrans(newtp);
 		newtp->tlp_high_seq = 0;
+		newtp->lsndtime = treq->snt_synack;
+		newtp->total_retrans = req->num_retrans;
 
 		/* So many TCP implementations out there (incorrectly) count the
 		 * initial SYN frame in their delayed-ACK and congestion control
@@ -666,12 +668,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (!(flg & TCP_FLAG_ACK))
 		return NULL;
 
-	/* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
-	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
-		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
-	else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
-		tcp_rsk(req)->snt_synack = 0;
-
 	/* For Fast Open no more processing is needed (sk is the
 	 * child socket).
 	 */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6e1649d58533..80fe69ef2188 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1237,8 +1237,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
-	tcp_synack_rtt_meas(newsk, req);
-	newtp->total_retrans = req->num_retrans;
 
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
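As background for the consolidation above: every sample that reaches
tcp_ack_update_rtt() ultimately feeds tcp_rtt_estimator() and
tcp_set_rto(), i.e. the RFC 6298 SRTT/RTTVAR computation. Below is a
minimal floating-point userspace sketch of that computation, using the
same "negative means no sample" convention the patch adopts. The names
(rtt_state, rtt_update) and constants are illustrative only; the
kernel's real estimator uses scaled fixed-point arithmetic and differs
in detail.

    /* Sketch of an RFC 6298 RTT estimator (hypothetical names). */
    #include <stdio.h>

    struct rtt_state {
    	double srtt;	/* smoothed RTT; 0 = no sample taken yet */
    	double rttvar;	/* RTT variance estimate */
    	double rto;	/* retransmission timeout */
    };

    #define RTO_MIN 0.2	/* illustrative floor, cf. TCP_RTO_MIN (200 ms) */
    #define RTO_MAX 120.0	/* illustrative ceiling, cf. TCP_RTO_MAX */

    static void rtt_update(struct rtt_state *s, double sample)
    {
    	if (sample < 0)		/* patch convention: < 0 = no sample */
    		return;
    	if (s->srtt == 0) {	/* first measurement (RFC 6298, 2.2) */
    		s->srtt = sample;
    		s->rttvar = sample / 2;
    	} else {		/* later measurements (RFC 6298, 2.3) */
    		double err = s->srtt > sample ? s->srtt - sample
    					      : sample - s->srtt;
    		s->rttvar = 0.75 * s->rttvar + 0.25 * err;
    		s->srtt = 0.875 * s->srtt + 0.125 * sample;
    	}
    	s->rto = s->srtt + 4 * s->rttvar;	/* RFC 6298, 2.4 */
    	if (s->rto < RTO_MIN) s->rto = RTO_MIN;
    	if (s->rto > RTO_MAX) s->rto = RTO_MAX;
    }

    int main(void)
    {
    	struct rtt_state s = {0};
    	double samples[] = { 0.100, 0.120, -1 /* no valid sample */, 0.500 };

    	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
    		rtt_update(&s, samples[i]);
    		printf("srtt=%.3f rto=%.3f\n", s.srtt, s.rto);
    	}
    	return 0;
    }

With this in mind, the patch's contribution is simply that the SYNACK
path and the data-ACK path now hand their samples to one chooser before
this estimator runs, instead of maintaining two entry points.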
From 5b08e47caf1f2034a3a5b566bbccc8b0be3961ca Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Mon, 22 Jul 2013 16:20:46 -0700
Subject: [PATCH 2/4] tcp: prefer packet timing to TS-ECR for RTT

Prefer packet timings to TS-ecr for RTT measurements when both sources
are available, because broken middle-boxes and remote peers can return
packets with corrupted TS ECR fields. Similarly, most congestion
controls that require RTT signals favor timing-based sources as well.

Also check for bad TS ECR values to avoid RTT blow-ups; this has
happened on production web servers.

Signed-off-by: Yuchung Cheng
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 include/net/tcp.h    |  1 -
 net/ipv4/tcp_input.c | 67 ++++++++++++--------------------------------
 2 files changed, 18 insertions(+), 50 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f9777dbede75..c5868471abae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -591,7 +591,6 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
 extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
-extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 extern void tcp_init_buffer_space(struct sock *sk);
 
 static inline void tcp_bound_rto(const struct sock *sk)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b531710596ec..c7398f05d12b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2792,65 +2792,36 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		tcp_xmit_retransmit_queue(sk);
 }
 
-void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      s32 seq_rtt)
 {
-	tcp_rtt_estimator(sk, seq_rtt);
-	tcp_set_rto(sk);
-	inet_csk(sk)->icsk_backoff = 0;
-}
-EXPORT_SYMBOL(tcp_valid_rtt_meas);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
+	 * Karn's algorithm forbids taking RTT if some retransmitted data
+	 * is acked (RFC6298).
+	 */
+	if (flag & FLAG_RETRANS_DATA_ACKED)
+		seq_rtt = -1;
 
-/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Supersedes RFC1323)
- */
-static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
-{
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
 	 * left edge of the send window.
-	 *
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
-	 * 1998/04/10	Andrey V. Savochkin
-	 *
-	 * Changed: reset backoff as soon as we see the first valid sample.
-	 * If we do not, we get strongly overestimated rto. With timestamps
-	 * samples are accepted even from very old segments: f.e., when rtt=1
-	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
-	 * answer arrives rto becomes 120 seconds! If at least one of segments
-	 * in window is lost... Voila.			--ANK (010210)
 	 */
-	struct tcp_sock *tp = tcp_sk(sk);
+	if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
-	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-}
-
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
-{
-	/* We don't have a timestamp. Can only use
-	 * packets that are not retransmitted to determine
-	 * rtt estimates. Also, we must not reset the
-	 * backoff for rto until we get a non-retransmitted
-	 * packet. This allows us to deal with a situation
-	 * where the network delay has increased suddenly.
-	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
-	 */
-
-	if (flag & FLAG_RETRANS_DATA_ACKED)
+	if (seq_rtt < 0)
 		return;
 
-	tcp_valid_rtt_meas(sk, seq_rtt);
-}
+	tcp_rtt_estimator(sk, seq_rtt);
+	tcp_set_rto(sk);
 
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      const s32 seq_rtt)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
-	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(sk, flag);
-	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(sk, seq_rtt, flag);
+	/* RFC6298: only reset backoff on valid RTT measurement. */
+	inet_csk(sk)->icsk_backoff = 0;
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
@@ -2989,8 +2960,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			if (sacked & TCPCB_SACKED_RETRANS)
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
-			ca_seq_rtt = -1;
-			seq_rtt = -1;
 		} else {
 			ca_seq_rtt = now - scb->when;
 			last_ackt = skb->tstamp;
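The sample-selection policy this patch folds into tcp_ack_update_rtt()
can be restated in isolation as follows. This is a simplified sketch,
not the kernel function itself: the flag constant and the parameters
are stand-ins for the kernel's internal state.

    /* Restatement of the patch's RTT-source preference (illustrative). */
    #include <stdbool.h>
    #include <stdint.h>

    #define FLAG_RETRANS_DATA_ACKED 0x01	/* stand-in flag bit */

    /* Pick the RTT sample to feed the estimator, or -1 for none. */
    static int32_t select_rtt_sample(int flag, int32_t timing_rtt,
    				 bool saw_tstamp, uint32_t rcv_tsecr,
    				 uint32_t now)
    {
    	/* Karn's algorithm: a timing sample that covers retransmitted
    	 * data is ambiguous, so discard it (RFC 6298, section 3).
    	 */
    	if (flag & FLAG_RETRANS_DATA_ACKED)
    		timing_rtt = -1;

    	/* Prefer the ACK's packet timing; TS-ECR is only the fallback,
    	 * since middleboxes and peers have been seen corrupting it,
    	 * and a zero echo is ignored per RFC 1323.
    	 */
    	if (timing_rtt < 0 && saw_tstamp && rcv_tsecr)
    		timing_rtt = (int32_t)(now - rcv_tsecr);

    	return timing_rtt;	/* may still be -1: no valid sample */
    }

The design point is the inversion of the old order: previously a
timestamp echo, trustworthy or not, always won; now it is consulted
only when no clean timing sample exists.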
From 59c9af4234b0c21a1ed05cf65bf014d0c1a67bfd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Mon, 22 Jul 2013 16:20:47 -0700
Subject: [PATCH 3/4] tcp: measure RTT from new SACK

Take an RTT sample if an ACK selectively acks some sequences that have
never been retransmitted. Karn's algorithm does not apply even if that
ACK (s)acks other retransmitted sequences, because the sample must have
been generated by an original, if perhaps out-of-order, packet; there
is no ambiguity. When multiple blocks are newly sacked because of ACK
losses, the earliest block is used to measure RTT, as with cumulative
ACKs.

Such RTT samples allow the sender to estimate the RTO during loss
recovery and packet reordering events. They are useful even with TCP
timestamps, because during these events SND.UNA may not advance,
preventing RTT samples from TS ECR (hence the FLAG_ACKED check before
calling tcp_ack_update_rtt()). This new RTT source is therefore
complementary to the existing ACK and TS RTT mechanisms.

This patch does not update the RTO; that is done in the next patch.

Signed-off-by: Yuchung Cheng
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c7398f05d12b..b85bc7c3736a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1048,6 +1048,7 @@ struct tcp_sacktag_state {
 	int reord;
 	int fack_count;
 	int flag;
+	s32 rtt; /* RTT measured by SACKing never-retransmitted data */
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1108,7 +1109,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 static u8 tcp_sacktag_one(struct sock *sk,
 			  struct tcp_sacktag_state *state, u8 sacked,
 			  u32 start_seq, u32 end_seq,
-			  bool dup_sack, int pcount)
+			  int dup_sack, int pcount, u32 xmit_time)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int fack_count = state->fack_count;
@@ -1148,6 +1149,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
 						state->reord);
 			if (!after(end_seq, tp->high_seq))
 				state->flag |= FLAG_ORIG_SACK_ACKED;
+			/* Pick the earliest sequence sacked for RTT */
+			if (state->rtt < 0)
+				state->rtt = tcp_time_stamp - xmit_time;
 		}
 
 		if (sacked & TCPCB_LOST) {
@@ -1205,7 +1209,8 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	 * tcp_highest_sack_seq() when skb is highest_sack.
 	 */
 	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
-			start_seq, end_seq, dup_sack, pcount);
+			start_seq, end_seq, dup_sack, pcount,
+			TCP_SKB_CB(skb)->when);
 
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
@@ -1479,7 +1484,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						TCP_SKB_CB(skb)->seq,
 						TCP_SKB_CB(skb)->end_seq,
 						dup_sack,
-						tcp_skb_pcount(skb));
+						tcp_skb_pcount(skb),
+						TCP_SKB_CB(skb)->when);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1536,7 +1542,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
 
 static int
 tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
-			u32 prior_snd_una)
+			u32 prior_snd_una, s32 *sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,6 +1560,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 
 	state.flag = 0;
 	state.reord = tp->packets_out;
+	state.rtt = -1;
 
 	if (!tp->sacked_out) {
 		if (WARN_ON(tp->fackets_out))
@@ -1737,6 +1744,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 	WARN_ON((int)tp->retrans_out < 0);
 	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
 #endif
+	*sack_rtt = state.rtt;
 	return state.flag;
 }
 
@@ -3254,6 +3262,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
 	int acked = 0; /* Number of packets newly acked */
+	s32 sack_rtt = -1;
 
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
@@ -3310,7 +3319,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
 
 		if (TCP_SKB_CB(skb)->sacked)
-			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+							&sack_rtt);
 
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
@@ -3382,7 +3392,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	 * If data was DSACKed, see if we can undo a cwnd reduction.
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
-		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+						&sack_rtt);
 		tcp_fastretrans_alert(sk, acked, prior_unsacked,
 				      is_dupack, flag);
 	}
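The sampling rule added to tcp_sacktag_one() amounts to: among the
segments newly SACKed by this ACK, take the earliest one that was never
retransmitted, because Karn's ambiguity cannot arise for it. A
self-contained sketch of that rule, with struct seg and the array
standing in for the kernel's skb walk over the retransmit queue:

    /* "Earliest newly-SACKed, never-retransmitted" rule (illustrative). */
    #include <stdint.h>

    struct seg {
    	uint32_t xmit_time;	/* cf. TCP_SKB_CB(skb)->when */
    	int retransmitted;	/* Karn: disqualifies the sample */
    	int newly_sacked;	/* first SACKed by the current ACK */
    };

    /* Returns RTT in ticks, or -1 if no unambiguous sample exists
     * (mirrors state->rtt staying negative).
     */
    static int32_t sack_rtt_sample(const struct seg *segs, int nsegs,
    			       uint32_t now)
    {
    	int32_t rtt = -1;

    	for (int i = 0; i < nsegs; i++) {	/* in sequence order */
    		if (!segs[i].newly_sacked || segs[i].retransmitted)
    			continue;
    		if (rtt < 0)	/* keep only the earliest block */
    			rtt = (int32_t)(now - segs[i].xmit_time);
    	}
    	return rtt;
    }

Taking the earliest block parallels cumulative ACKs, which also time
the oldest outstanding data; later blocks in the same ACK may have been
delayed by the receiver and would bias the sample upward.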
From ed08495c31bb991de636d2488abaa50b39f2ff4a Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Mon, 22 Jul 2013 16:20:48 -0700
Subject: [PATCH 4/4] tcp: use RTT from SACK for RTO

If the RTT is not available because Karn's check has failed or no new
packet was acked, use the RTT measured from SACK to estimate the RTO.
The sender can then continue to estimate the RTO during loss recovery
or reordering events upon receiving non-partial ACKs.

This also changes when the RTO is re-armed. Previously it was re-armed
only when some data was cumulatively acknowledged (i.e., SND.UNA
advanced), but now it is re-armed whenever the RTT estimator is
updated. This is particularly useful for reducing spurious timeouts
under buffer bloat, including on cellular carriers [1], and for RTT
estimation during reordering events.

[1] "An In-depth Study of LTE: Effect of Network Protocol and
    Application Behavior on Performance", In Proc. of SIGCOMM 2013

Signed-off-by: Yuchung Cheng
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b85bc7c3736a..b61274b666f6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2800,8 +2800,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		tcp_xmit_retransmit_queue(sk);
 }
 
-static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      s32 seq_rtt)
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      s32 seq_rtt, s32 sack_rtt)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2813,6 +2813,9 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		seq_rtt = -1;
 
+	if (seq_rtt < 0)
+		seq_rtt = sack_rtt;
+
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
@@ -2823,13 +2826,14 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 		seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
 
 	if (seq_rtt < 0)
-		return;
+		return false;
 
 	tcp_rtt_estimator(sk, seq_rtt);
 	tcp_set_rto(sk);
 
 	/* RFC6298: only reset backoff on valid RTT measurement. */
 	inet_csk(sk)->icsk_backoff = 0;
+	return true;
 }
 
 /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
@@ -2840,7 +2844,7 @@ static void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
 
 	if (tp->lsndtime && !tp->total_retrans)
 		seq_rtt = tcp_time_stamp - tp->lsndtime;
-	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt);
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
 }
 
 static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
@@ -2929,7 +2933,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
  * arrived at the other end.
 */
 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
-			       u32 prior_snd_una)
+			       u32 prior_snd_una, s32 sack_rtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3019,6 +3023,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
 		flag |= FLAG_SACK_RENEGING;
 
+	if (tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt) ||
+	    (flag & FLAG_ACKED))
+		tcp_rearm_rto(sk);
+
 	if (flag & FLAG_ACKED) {
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
@@ -3028,9 +3036,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tcp_mtup_probe_success(sk);
 		}
 
-		tcp_ack_update_rtt(sk, flag, seq_rtt);
-		tcp_rearm_rto(sk);
-
 		if (tcp_is_reno(tp)) {
 			tcp_remove_reno_sacks(sk, pkts_acked);
 		} else {
@@ -3339,7 +3344,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	acked = tp->packets_out;
-	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
 	acked -= tp->packets_out;
 
 	if (tcp_ack_is_dubious(sk, flag)) {
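The resulting control flow in tcp_clean_rtx_queue() can be summarized
by the sketch below. rtt_estimator_update() and rearm_rto_timer() are
hypothetical stand-ins for tcp_ack_update_rtt() and tcp_rearm_rto(),
and the flag constant is illustrative.

    /* Sketch of the new re-arm policy (hypothetical helper names). */
    #include <stdbool.h>
    #include <stdint.h>

    #define FLAG_ACKED 0x02		/* stand-in: SND.UNA advanced */

    /* Stand-ins for tcp_ack_update_rtt() / tcp_rearm_rto(). */
    extern bool rtt_estimator_update(int flag, int32_t seq_rtt,
    				 int32_t sack_rtt);
    extern void rearm_rto_timer(void);

    static void on_ack(int flag, int32_t seq_rtt, int32_t sack_rtt)
    {
    	/* Before this patch the RTO was re-armed only under FLAG_ACKED;
    	 * now any successful RTT update (timing, TS-ECR, or SACK) also
    	 * restarts the timer, so the RTO keeps tracking an inflated RTT
    	 * during loss recovery and reordering instead of firing
    	 * spuriously off a stale estimate.
    	 */
    	if (rtt_estimator_update(flag, seq_rtt, sack_rtt) ||
    	    (flag & FLAG_ACKED))
    		rearm_rto_timer();
    }

Note the ordering of sources inside tcp_ack_update_rtt() after this
patch: packet timing first, then the SACK-derived sample, then TS-ECR
as the last resort, each consulted only if the previous one yielded no
valid measurement.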