Update to bring in the rack stack with all of its fixes.

This brings the rack stack up to the current level used at NF. Many fixes
and improvements have been added. I also add a fix to BBR to deal with
the changes that have been in hpts for a while, i.e. hpts now makes only one
call into the stack whether it is processing the mbuf queue or calling tcp_output.

Note there is a new file, rack_pcm.c, that I can't figure out how to get in.

It basically does little except emit BBlogs and is a placeholder for future work
on doing path capacity measurements.

Reviewed by: tuexen, glebius
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D43986
Randall Stewart 2024-03-11 07:36:54 -04:00
parent 96c567f972
commit f6d489f402
14 changed files with 3581 additions and 1165 deletions

View file

@ -5,7 +5,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c
SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c rack_pcm.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h

View file

@ -334,9 +334,22 @@ __tcp_set_flags(struct tcphdr *th, uint16_t flags)
#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
#define TCP_POLICER_DETECT 1149 /* Do we apply thresholds to rack to detect and compensate for policers? */
#define TCP_RXT_CLAMP TCP_POLICER_DETECT
#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
#define TCP_SS_EEXIT 1152 /* Do we do early exit from slow start if no b/w growth */
#define TCP_DGP_UPPER_BOUNDS 1153 /* SS and CA upper bound in percentage */
#define TCP_NO_TIMELY 1154 /* Disable/enable Timely */
#define TCP_HONOR_HPTS_MIN 1155 /* Do we honor the hpts min timeout */
#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */
#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */
#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */
#define TCP_POLICER_MSS 1159 /* Policer MSS requirement */
#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */
#define RACK_CSPR_IS_FCC 1161
#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@ -447,6 +460,7 @@ struct tcp_info {
u_int32_t tcpi_rcv_adv; /* Peer advertised window */
u_int32_t tcpi_dupacks; /* Consecutive dup ACKs recvd */
u_int32_t tcpi_rttmin; /* Min observed RTT */
/* Padding to grow without breaking ABI. */
u_int32_t __tcpi_pad[14]; /* Padding. */
};
@ -463,6 +477,20 @@ struct tcp_fastopen {
#define TCP_FUNCTION_NAME_LEN_MAX 32
struct stack_specific_info {
char stack_name[TCP_FUNCTION_NAME_LEN_MAX];
uint64_t policer_last_bw; /* Only valid if detection enabled and policer detected */
uint64_t bytes_transmitted;
uint64_t bytes_retransmitted;
uint32_t policer_detection_enabled: 1,
policer_detected : 1, /* transport thinks a policer is on path */
highly_buffered : 1, /* transport considers the path highly buffered */
spare : 29;
uint32_t policer_bucket_size; /* Only valid if detection enabled and policer detected */
uint32_t current_round;
uint32_t _rack_i_pad[18];
};
struct tcp_function_set {
char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
uint32_t pcbcnt;
@ -488,6 +516,7 @@ struct tcp_snd_req {
uint64_t start;
uint64_t end;
uint32_t flags;
uint32_t playout_ms;
};
union tcp_log_userdata {
@ -518,9 +547,12 @@ struct tcp_log_user {
#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */
#define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */
#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Client wants us to set the mss overriding gp est in CU */
#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tellsus we set the mss on this entry */
#define TCP_HAS_PLAYOUT_MS 0x0040 /* The client included the chunk playout milliseconds: deprecate */
/* the below are internal only flags */
#define TCP_HYBRID_PACING_USER_MASK 0x0FFF /* Non-internal flags mask */
#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */
#define TCP_HYBRID_PACING_WASSET 0x2000 /* We init to this to know if a hybrid command was issued */
#define TCP_HYBRID_PACING_SENDTIME 0x4000 /* Duplicate tm to last, use sendtime for catch up mode */
struct tcp_hybrid_req {
struct tcp_snd_req req;
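The new TCP_STACK_SPEC_INFO socket option (1160, above) pairs with struct stack_specific_info. A minimal userland sketch of how a tool might query it, assuming the option is read via getsockopt() at the IPPROTO_TCP level on a connected socket and that the running stack implements tfb_stack_info; the helper and its error handling are illustrative only:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper: dump stack-specific counters for socket fd. */
static void
print_stack_info(int fd)
{
	struct stack_specific_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_STACK_SPEC_INFO,
	    &info, &len) == -1) {
		perror("TCP_STACK_SPEC_INFO");	/* stack may not provide it */
		return;
	}
	printf("stack %s: sent %ju rexmit %ju round %u\n",
	    info.stack_name, (uintmax_t)info.bytes_transmitted,
	    (uintmax_t)info.bytes_retransmitted, info.current_round);
	if (info.policer_detection_enabled && info.policer_detected)
		printf("policer: bw %ju bucket %u\n",
		    (uintmax_t)info.policer_last_bw,
		    info.policer_bucket_size);
}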

View file

@ -267,7 +267,9 @@ enum tcp_log_events {
TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */
TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */
TCP_LOG_PRU, /* TCP protocol user request 70 */
TCP_LOG_END /* End (keep at end) 71 */
TCP_POLICER_DET, /* TCP Policer detection 71 */
TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */
TCP_LOG_END /* End (keep at end) 73 */
};
enum tcp_log_states {
@ -371,10 +373,11 @@ struct tcp_log_dev_log_queue {
#define TCP_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
#define TCP_TP_REQ_LOG_FAIL 0x00000005 /* We tried to allocate a Request log but had no space */
#define TCP_TP_RESET_RCV 0x00000006 /* Triggers when we receive a RST */
#define TCP_TP_EXCESS_RXT 0x00000007 /* When we get excess RXT's clamping the cwnd */
#define TCP_TP_POLICER_DET 0x00000007 /* When we detect a policer */
#define TCP_TP_EXCESS_RXT TCP_TP_POLICER_DET /* alias */
#define TCP_TP_SAD_TRIGGERED 0x00000008 /* Sack Attack Detection triggers */
#define TCP_TP_SAD_SUSPECT 0x0000000a /* A sack has suspicious information in it */
#define TCP_TP_PACED_BOTTOM 0x0000000b /* We have paced at the bottom */
#ifdef _KERNEL

View file

@ -11529,7 +11529,9 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
bbr_set_pktepoch(bbr, cts, __LINE__);
bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
if (nxt_pkt == 0) {
if (bbr->r_wanted_output != 0) {
if ((bbr->r_wanted_output != 0) ||
(tp->t_flags & TF_ACKNOW)) {
bbr->rc_output_starts_timer = 0;
did_out = 1;
if (tcp_output(tp) < 0)
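This hunk is the BBR fix called out in the commit message: now that hpts makes a single call into the stack whether segments arrived via the mbuf queue or direct input, BBR itself must force an output pass when an ACK is owed. A sketch of the full conditional with its presumed error path (the lines past the truncated tcp_output() test are an assumption, following the usual pattern where a negative return means the connection was dropped):

	if ((bbr->r_wanted_output != 0) ||
	    (tp->t_flags & TF_ACKNOW)) {	/* new: also fire when an ACK is owed */
		bbr->rc_output_starts_timer = 0;
		did_out = 1;
		if (tcp_output(tp) < 0)
			return (1);		/* assumed: tcpcb gone, tell caller */
	}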

File diff suppressed because it is too large

View file

View file

@ -51,5 +51,10 @@ void sack_filter_clear(struct sack_filter *sf, tcp_seq seq);
int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
tcp_seq th_ack);
void sack_filter_reject(struct sack_filter *sf, struct sackblk *in);
static inline uint8_t sack_filter_blks_used(struct sack_filter *sf)
{
return (sf->sf_used);
}
#endif
#endif
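The new inline accessor lets callers outside sack_filter.c see how many filtered blocks are currently held without reaching into the structure. A hedged usage sketch (hypothetical caller; sack_filter_clear() is from the declarations above):

	/* Hypothetical caller: only bother resetting the filter around
	 * a retransmission timeout if it actually holds state. */
	if (sack_filter_blks_used(sf) > 0)
		sack_filter_clear(sf, tp->snd_una);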

View file

@ -65,7 +65,6 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
@ -100,6 +99,7 @@
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "opt_global.h"
struct rack_sendmap *
@ -107,7 +107,7 @@ tqhash_min(struct tailq_hash *hs)
{
struct rack_sendmap *rsm;
rsm = tqhash_find(hs, hs->min);
rsm = hs->rsm_min;
return(rsm);
}
@ -116,7 +116,7 @@ tqhash_max(struct tailq_hash *hs)
{
struct rack_sendmap *rsm;
rsm = tqhash_find(hs, (hs->max - 1));
rsm = hs->rsm_max;
return (rsm);
}
@ -224,13 +224,19 @@ tqhash_prev(struct tailq_hash *hs, struct rack_sendmap *rsm)
void
tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type)
{
TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
hs->count--;
if (hs->count == 0) {
hs->min = hs->max;
hs->rsm_max = hs->rsm_min = NULL;
} else if (type == REMOVE_TYPE_CUMACK) {
hs->min = rsm->r_end;
hs->rsm_min = tqhash_next(hs, rsm);
} else if (rsm == hs->rsm_max) {
hs->rsm_max = tqhash_prev(hs, rsm);
hs->max = hs->rsm_max->r_end;
}
TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
}
int
@ -240,6 +246,7 @@ tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm)
int inserted = 0;
uint32_t ebucket;
#ifdef INVARIANTS
if (hs->count > 0) {
if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) {
return (-1);
@ -249,6 +256,7 @@ tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm)
return (-2);
}
}
#endif
rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE;
rsm->bindex %= MAX_HASH_ENTRIES;
ebucket = rsm->r_end / SEQ_BUCKET_SIZE;
@ -263,13 +271,17 @@ tqhash_insert(struct tailq_hash *hs, struct rack_sendmap *rsm)
/* Special case */
hs->min = rsm->r_start;
hs->max = rsm->r_end;
hs->rsm_min = hs->rsm_max = rsm;
hs->count = 1;
} else {
hs->count++;
if (SEQ_GT(rsm->r_end, hs->max))
if (SEQ_GEQ(rsm->r_end, hs->max)) {
hs->max = rsm->r_end;
if (SEQ_LT(rsm->r_start, hs->min))
hs->rsm_max = rsm;
}
if (SEQ_LEQ(rsm->r_start, hs->min)) {
hs->min = rsm->r_start;
hs->rsm_min = rsm;
}
}
/* Check the common case of inserting at the end */
l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head);
@ -299,6 +311,7 @@ tqhash_init(struct tailq_hash *hs)
TAILQ_INIT(&hs->ht[i]);
}
hs->min = hs->max = 0;
hs->rsm_min = hs->rsm_max = NULL;
hs->count = 0;
}
@ -339,3 +352,11 @@ tqhash_trim(struct tailq_hash *hs, uint32_t th_ack)
return (0);
}
void
tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm,
uint32_t th_ack)
{
if (hs->max == rsm->r_end)
hs->max = th_ack;
rsm->r_end = th_ack;
}
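With rsm_min/rsm_max cached, tqhash_min() and tqhash_max() become plain loads instead of bucket searches, at the cost that every mutation (insert, remove, trim, and the new tqhash_update_end()) must keep the cache coherent with the min/max sequence numbers. A hedged sketch of the invariant this maintains, written as a hypothetical INVARIANTS check (not part of this commit; assumes a contiguous sendmap):

#ifdef INVARIANTS
/* Hypothetical consistency check: the cached entries must agree
 * with the stored sequence bounds whenever the hash is non-empty. */
static void
tqhash_verify_cache(struct tailq_hash *hs)
{
	if (hs->count == 0) {
		KASSERT((hs->rsm_min == NULL) && (hs->rsm_max == NULL),
		    ("tqhash %p: cache set while empty", hs));
		return;
	}
	KASSERT(hs->rsm_min->r_start == hs->min,
	    ("tqhash %p: rsm_min disagrees with min %u", hs, hs->min));
	KASSERT(hs->rsm_max->r_end == hs->max,
	    ("tqhash %p: rsm_max disagrees with max %u", hs, hs->max));
}
#endif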

View file

@ -13,10 +13,12 @@
#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1))
struct tailq_hash {
struct rack_head ht[MAX_HASH_ENTRIES];
uint32_t min;
uint32_t max;
uint32_t count;
struct rack_sendmap *rsm_min;
struct rack_sendmap *rsm_max;
struct rack_head ht[MAX_HASH_ENTRIES];
};
struct rack_sendmap *
@ -53,6 +55,10 @@ tqhash_init(struct tailq_hash *hs);
int
tqhash_trim(struct tailq_hash *hs, uint32_t th_ack);
void
tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm,
uint32_t th_ack);
#define TQHASH_FOREACH(var, head) \
for ((var) = tqhash_min((head)); \

View file

@ -48,6 +48,8 @@
#define RACK_MERGED 0x080000/* The RSM was merged */
#define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */
#define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */
#define RACK_WAS_LOST 0x400000/* Is the rsm considered lost */
#define RACK_IS_PCM 0x800000/* A PCM measurement is being taken */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
@ -63,6 +65,7 @@ struct rack_sendmap {
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint32_t r_flags : 24, /* Flags as defined above */
r_rtr_cnt : 8; /* Retran count, index this -1 to get time */
uint32_t r_act_rxt_cnt; /* The actual total count of transmits */
struct mbuf *m;
uint32_t soff;
uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */
@ -174,6 +177,8 @@ struct rack_rtt_sample {
#define RACK_TO_FRM_PERSIST 5
#define RACK_TO_FRM_DELACK 6
#define RCV_PATH_RTT_MS 10 /* How many ms between recv path RTT's */
struct rack_opts_stats {
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_pace_always;
@ -232,7 +237,7 @@ struct rack_opts_stats {
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
uint64_t tcp_rxt_clamp;
uint64_t tcp_pol_detect;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
uint64_t tcp_rack_timer_slop;
@ -242,6 +247,11 @@ struct rack_opts_stats {
uint64_t tcp_rack_pacing_divisor;
uint64_t tcp_rack_min_seg;
uint64_t tcp_dgp_in_rec;
uint64_t tcp_notimely;
uint64_t tcp_honor_hpts;
uint64_t tcp_dyn_rec;
uint64_t tcp_fillcw_rate_cap;
uint64_t tcp_pol_mss;
};
/* RTT shrink reasons */
@ -263,6 +273,9 @@ struct rack_opts_stats {
#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */
#define RACK_MIN_BW 8000 /* 64kbps in Bps */
#define CCSP_DIS_MASK 0x0001
#define HYBRID_DIS_MASK 0x0002
/* Rack quality indicators for GPUT measurements */
#define RACK_QUALITY_NONE 0 /* No quality stated */
#define RACK_QUALITY_HIGH 1 /* A normal measurement of a GP RTT */
@ -319,6 +332,7 @@ extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
*
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
#define RETRAN_CNT_SIZE 16
#define RACK_NUM_FSB_DEBUG 16
#ifdef _KERNEL
@ -342,6 +356,26 @@ struct rack_fast_send_blk {
struct tailq_hash;
struct rack_pcm_info {
/* Base send time and s/e filled in by rack_log_output */
uint64_t send_time;
uint32_t sseq;
uint32_t eseq;
/* Ack's fill in the rest of the data */
uint16_t cnt;
/* Maximum acks present */
uint16_t cnt_alloc;
};
#define RACK_DEFAULT_PCM_ARRAY 16
struct rack_pcm_stats {
uint32_t sseq;
uint32_t eseq;
uint64_t ack_time;
};
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct tailq_hash *tqh; /* Tree of all segments Lock(a) */
@ -402,6 +436,7 @@ struct rack_control {
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_num_split_allocs; /* num split map entries allocated */
uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */
uint32_t rack_avg_rec_sends;
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
@ -452,19 +487,45 @@ struct rack_control {
struct tcp_sendfile_track *rc_last_sft;
uint32_t lt_seq; /* Seq at start of lt_bw gauge */
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
uint64_t last_sndbytes;
uint64_t last_snd_rxt_bytes;
uint64_t rxt_threshold;
uint64_t last_tmit_time_acked; /* Holds the last cumack point's last send time */
uint32_t last_rnd_rxt_clamped;
uint32_t num_of_clamps_applied;
uint32_t clamp_options;
uint32_t max_clamps;
/* Recovery stats */
uint64_t time_entered_recovery;
uint64_t bytes_acked_in_recovery;
/* Policer Detection */
uint64_t last_policer_sndbytes;
uint64_t last_policer_snd_rxt_bytes;
uint64_t policer_bw;
uint64_t last_sendtime;
uint64_t last_gpest;
uint64_t last_tm_mark; /* Last tm mark used */
uint64_t fillcw_cap; /* B/W cap on fill cw */
struct rack_pcm_info pcm_i;
struct rack_pcm_stats *pcm_s;
uint32_t gp_gain_req; /* Percent off gp gain req */
uint32_t last_rnd_of_gp_rise;
uint32_t gp_rnd_thresh;
uint32_t ss_hi_fs;
uint32_t gate_to_fs;
uint32_t policer_max_seg;
uint32_t pol_bw_comp;
uint16_t policer_rxt_threshold;
uint8_t policer_avg_threshold;
uint8_t policer_med_threshold;
uint32_t pcm_max_seg;
uint32_t last_pcm_round;
uint32_t pcm_idle_rounds;
uint32_t current_policer_bucket;
uint32_t policer_bucket_size;
uint32_t idle_snd_una;
uint32_t ack_for_idle;
uint32_t last_amount_before_rec;
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */
uint32_t rc_loss_at_start; /* At measurement window where was our lost value */
uint32_t rc_considered_lost; /* Count in recovery of non-retransmitted bytes considered lost */
uint32_t dsack_round_end; /* In a round of seeing a DSACK */
uint32_t current_round; /* Starting at zero */
@ -491,6 +552,8 @@ struct rack_control {
uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occurred what was snd-max */
uint32_t rc_out_at_rto;
int32_t rc_scw_index;
uint32_t max_reduction;
uint32_t side_chan_dis_mask; /* Bit mask of socket opt's disabled */
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint32_t rc_last_timeout_snduna;
uint32_t last_tlp_acked_start;
@ -503,7 +566,11 @@ struct rack_control {
uint32_t ack_during_sd;
uint32_t input_pkt;
uint32_t saved_input_pkt;
uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */
uint32_t saved_policer_val; /* The encoded value we used to setup policer detection */
uint32_t cleared_app_ack_seq;
uint32_t last_rcv_tstmp_for_rtt;
uint32_t last_time_of_arm_rcv;
uint32_t rto_ssthresh;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@ -516,10 +583,13 @@ struct rack_control {
* we also set the flag (if ecn_beta is set) to make
* new_reno do less of a backoff for ecn (think abe).
*/
uint16_t rc_cnt_of_retran[RETRAN_CNT_SIZE];
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
uint8_t policer_del_mss; /* How many mss during recovery for policer detection */
uint8_t rack_per_upper_bound_ss;
uint8_t rack_per_upper_bound_ca;
uint8_t cleared_app_ack;
uint8_t dsack_persist;
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
@ -528,17 +598,19 @@ struct rack_control {
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */
uint8_t policer_alt_median; /* Alternate median for policer detection */
uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */
uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */
uint8_t pacing_discount_amm; /*
* This is a multiplier to the base discount that
* can be used to increase the discount.
*/
uint8_t use_gp_not_last;
uint8_t pacing_method; /* If pace_always, what type of pacing */
uint8_t already_had_a_excess;
};
#endif
#define RACK_PACING_NONE 0x00
#define RACK_DGP_PACING 0x01
#define RACK_REG_PACING 0x02
/* DGP with no buffer level mitigations */
#define DGP_LEVEL0 0
@ -578,6 +650,10 @@ struct rack_control {
#define HYBRID_LOG_EXTEND 14 /* We extended the end */
#define HYBRID_LOG_SENT_LOST 15 /* A closing sent/lost report */
#define LOST_ZERO 1 /* Zero it out */
#define LOST_ADD 2 /* Add to it */
#define LOST_SUB 3 /* Sub from it */
#define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */
#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */
@ -590,6 +666,7 @@ struct rack_control {
*/
#define MAX_USER_SET_SEG 0x3f /* The max we can set is 63 which is probably too many */
#define RACK_FREE_CNT_MAX 0x2f /* Max our counter can do */
#ifdef _KERNEL
@ -601,8 +678,9 @@ struct tcp_rack {
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
uint8_t rc_free_cnt : 6,
rc_skip_timely : 1,
pcm_enabled : 1; /* Is PCM enabled */
uint8_t client_bufferlvl : 3, /* Expected range [0,5]: 0=unset, 1=low/empty */
rack_deferred_inited : 1,
/* ******************************************************************** */
@ -612,11 +690,11 @@ struct tcp_rack {
shape_rxt_to_pacing_min : 1,
/* ******************************************************************** */
rc_ack_required: 1,
r_pacing_discount : 1;
r_use_hpts_min : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
excess_rxt_on: 1, /* Are actions on for excess retransmissions? */
dis_lt_bw : 1,
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
@ -659,7 +737,7 @@ struct tcp_rack {
r_rack_hw_rate_caps: 1,
r_up_only: 1,
r_via_fill_cw : 1,
r_fill_less_agg : 1;
r_rcvpath_rtt_up : 1;
uint8_t rc_user_set_max_segs : 7, /* Socket option value Lock(a) */
rc_fillcw_apply_discount;
@ -673,7 +751,7 @@ struct tcp_rack {
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_pace_dnd : 1, /* The pace do not disturb bit */
rc_avali2 : 1,
rc_initial_ss_comp : 1,
rc_gp_filled : 1,
rc_hw_nobuf : 1;
uint8_t r_state : 4, /* Current rack state Lock(a) */
@ -696,8 +774,8 @@ struct tcp_rack {
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,
r_cwnd_was_clamped : 1,
r_clamped_gets_lower : 1,
use_lesser_lt_bw : 1,
cspr_is_fcc : 1,
rack_hdrw_pacing : 1, /* We are doing Hardware pacing */
rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */
rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */
@ -722,7 +800,14 @@ struct tcp_rack {
r_persist_lt_bw_off : 1,
r_collapse_point_valid : 1,
dgp_on : 1;
uint16_t rc_init_win : 8,
uint16_t rto_from_rec: 1,
avail_bit: 1,
pcm_in_progress: 1,
pcm_needed: 1,
policer_detect_on: 1, /* Are we detecting policers? */
rc_policer_detected : 1, /* We are being policed */
rc_policer_should_pace : 1, /* The sizing algo thinks we should pace */
rc_sendvars_notset : 1, /* Inside rack_init send variables (snd_max/una etc) were not set */
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,
rc_gp_saw_rec : 1,
@ -735,5 +820,9 @@ struct tcp_rack {
struct rack_control r_ctl;
} __aligned(CACHE_LINE_SIZE);
void rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack,
uint32_t ss, uint32_t es);
#endif
#endif
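Per the comments on rack_pcm_info, a PCM round is book-kept in two halves: rack_log_output() stamps the send time and sequence range, and the ACK path records up to cnt_alloc rack_pcm_stats samples (RACK_DEFAULT_PCM_ARRAY by default) through rack_update_pcm_ack(). A hedged sketch of how a completed round could be reduced to a bandwidth figure (hypothetical helper; the real consumer lives in the suppressed rack.c/rack_pcm.c diffs, and both timestamps are assumed to share one microsecond clock):

/* Hypothetical reducer: collapse a finished PCM round into an
 * approximate path capacity in bytes per microsecond. */
static uint64_t
pcm_round_bw(const struct rack_pcm_info *pi,
    const struct rack_pcm_stats *ps)
{
	const struct rack_pcm_stats *last;
	uint64_t bytes, usecs;

	if (pi->cnt == 0)
		return (0);
	last = &ps[pi->cnt - 1];
	bytes = last->eseq - pi->sseq;		/* sequence range covered */
	usecs = last->ack_time - pi->send_time;	/* send to final ack */
	return ((usecs != 0) ? (bytes / usecs) : 0);
}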

View file

@ -287,18 +287,29 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_R
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;
static counter_u64_t tcp_pacing_failures;
static counter_u64_t tcp_dgp_failures;
static uint32_t shadow_tcp_pacing_dgp = 0;
static volatile uint32_t number_of_dgp_connections = 0;
static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
&tcp_pacing_limit, 1000,
"If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
static int tcp_dgp_limit = -1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW,
&tcp_dgp_limit, -1,
"If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)");
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
&shadow_num_connections, 0, "Number of TCP connections being paced");
SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD,
&tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit");
SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD,
&tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit");
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
@ -1571,6 +1582,7 @@ tcp_init(void *arg __unused)
tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
tcp_bad_csums = counter_u64_alloc(M_WAITOK);
tcp_pacing_failures = counter_u64_alloc(M_WAITOK);
tcp_dgp_failures = counter_u64_alloc(M_WAITOK);
#ifdef TCPPCAP
tcp_pcap_init();
#endif
@ -4022,6 +4034,43 @@ tcp_can_enable_pacing(void)
}
}
int
tcp_incr_dgp_pacing_cnt(void)
{
if ((tcp_dgp_limit == -1) ||
(tcp_dgp_limit > number_of_dgp_connections)) {
atomic_fetchadd_int(&number_of_dgp_connections, 1);
shadow_tcp_pacing_dgp = number_of_dgp_connections;
return (1);
} else {
counter_u64_add(tcp_dgp_failures, 1);
return (0);
}
}
static uint8_t tcp_dgp_warning = 0;
void
tcp_dec_dgp_pacing_cnt(void)
{
uint32_t ret;
ret = atomic_fetchadd_int(&number_of_dgp_connections, -1);
shadow_tcp_pacing_dgp = number_of_dgp_connections;
KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?"));
if (ret == 0) {
if (tcp_dgp_limit != -1) {
printf("Warning all DGP is now disabled, count decrements invalidly!\n");
tcp_dgp_limit = 0;
tcp_dgp_warning = 1;
} else if (tcp_dgp_warning == 0) {
printf("Warning DGP pacing is invalid, invalid decrement\n");
tcp_dgp_warning = 1;
}
}
}
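These mirror tcp_can_enable_pacing()/tcp_decrement_paced_conn() but gate on the separate net.inet.tcp.dgp_limit. A hedged sketch of the expected call pattern in a stack's DGP enable/disable paths (hypothetical caller; dgp_on, pacing_method, RACK_DGP_PACING and RACK_REG_PACING are from the tcp_rack.h diff above):

	/* Hypothetical enable path: reserve a DGP slot, falling back
	 * to regular pacing when the administrative limit is reached. */
	if (tcp_incr_dgp_pacing_cnt()) {
		rack->dgp_on = 1;
		rack->r_ctl.pacing_method |= RACK_DGP_PACING;
	} else {
		rack->dgp_on = 0;
		rack->r_ctl.pacing_method |= RACK_REG_PACING;
	}

	/* Hypothetical disable/teardown path: release the slot once. */
	if (rack->r_ctl.pacing_method & RACK_DGP_PACING) {
		rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
		tcp_dec_dgp_pacing_cnt();
	}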
static uint8_t tcp_pacing_warning = 0;
void
@ -4541,7 +4590,7 @@ tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, i
if (tp->t_tcpreq_req) {
for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) {
fil = &tp->t_tcpreq_info[i];
if (fil->flags != TCP_TRK_TRACK_FLG_USED)
if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0)
continue;
if ((fil->timestamp == req->timestamp) &&
(fil->start == req->start) &&
@ -4573,6 +4622,7 @@ tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, i
allocated = 1;
fil->flags = TCP_TRK_TRACK_FLG_USED;
fil->timestamp = req->timestamp;
fil->playout_ms = req->playout_ms;
fil->localtime = ts;
fil->start = req->start;
if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
@ -4589,7 +4639,10 @@ tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, i
fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
fil->start_seq = tp->snd_una +
tptosocket(tp)->so_snd.sb_ccc;
fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
if (req->flags & TCP_LOG_HTTPD_RANGE_END)
fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
else
fil->end_seq = 0;
if (tptosocket(tp)->so_snd.sb_tls_info) {
/*
* This session is doing TLS. Take a swag guess

View file

@ -1032,7 +1032,10 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
if (!solisten_enqueue(so, SS_ISCONNECTED))
tp->t_flags |= TF_SONOTCONN;
/* Can we inherit anything from the listener? */
if (tp->t_fb->tfb_inherit != NULL) {
(*tp->t_fb->tfb_inherit)(tp, sotoinpcb(lso));
}
return (so);
allocfail:

View file

@ -179,6 +179,12 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td)
goto out;
}
tp->t_state = TCPS_CLOSED;
/* Can we inherit anything from the listener? */
if ((so->so_listen != NULL) &&
(so->so_listen->so_pcb != NULL) &&
(tp->t_fb->tfb_inherit != NULL)) {
(*tp->t_fb->tfb_inherit)(tp, sotoinpcb(so->so_listen));
}
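Together with the syncache hook above, this gives a stack two chances to copy listener state onto a new connection: at accept time via the syncache, and here at attach time when the socket was created under a listener. A hedged sketch of what a stack's tfb_inherit might look like (hypothetical function and copied field; any real implementation, e.g. rack's, lives in the stack itself):

/* Hypothetical tfb_inherit: copy a listener-set knob to the child.
 * h_inp is the listener's inpcb handed in by the call sites above. */
static void
example_inherit(struct tcpcb *tp, struct inpcb *h_inp)
{
	struct tcpcb *h_tp = intotcpcb(h_inp);

	if (h_tp == NULL)
		return;
	/* Illustrative only: inherit the listener's maxseg clamp. */
	tp->t_maxseg = h_tp->t_maxseg;
}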
tcp_bblog_pru(tp, PRU_ATTACH, error);
INP_WUNLOCK(inp);
TCPSTATES_INC(TCPS_CLOSED);
@ -1601,6 +1607,7 @@ tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti)
ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
ti->tcpi_rcv_adv = tp->rcv_adv;
ti->tcpi_dupacks = tp->t_dupacks;
ti->tcpi_rttmin = tp->t_rttlow;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
ti->tcpi_options |= TCPI_OPT_TOE;

View file

@ -138,7 +138,8 @@ STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
#define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */
#define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */
#define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */
#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
#define TCP_TRK_TRACK_FLG_LSND 0x20 /* We were able to set the Last Sent */
#define MAX_TCP_TRK_REQ 5 /* Max we will have at once */
struct tcp_sendfile_track {
@ -151,11 +152,14 @@ struct tcp_sendfile_track {
uint64_t cspr; /* Client suggested pace rate */
uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */
uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */
uint64_t sent_at_ls; /* Sent value at the last send */
uint64_t rxt_at_ls; /* Retransmit value at the last send */
tcp_seq start_seq; /* First TCP Seq assigned */
tcp_seq end_seq; /* If range req last seq */
uint32_t flags; /* Type of request open etc */
uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */
uint32_t hint_maxseg; /* Client hinted maxseg */
uint32_t playout_ms; /* Client playout ms */
uint32_t hybrid_flags; /* Hybrid flags on this request */
};
@ -623,6 +627,8 @@ struct tcp_function_block {
void (*tfb_switch_failed)(struct tcpcb *);
bool (*tfb_early_wake_check)(struct tcpcb *);
int (*tfb_compute_pipe)(struct tcpcb *tp);
int (*tfb_stack_info)(struct tcpcb *tp, struct stack_specific_info *);
void (*tfb_inherit)(struct tcpcb *tp, struct inpcb *h_inp);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@ -788,7 +794,7 @@ tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack)
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_CLOSED 0x04000000 /* close(2) called on socket */
#define TF_UNUSED1 0x08000000 /* unused */
#define TF_SENTSYN 0x08000000 /* At least one syn has been sent */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
@ -1501,6 +1507,8 @@ void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
int tcp_can_enable_pacing(void);
int tcp_incr_dgp_pacing_cnt(void);
void tcp_dec_dgp_pacing_cnt(void);
void tcp_decrement_paced_conn(void);
void tcp_change_time_units(struct tcpcb *, int);
void tcp_handle_orphaned_packets(struct tcpcb *);