tcp: use single locked callout per tcpcb for the TCP timers

Use only one callout structure per tcpcb that is responsible for handling
all five TCP timeouts.  Use locked version of callout, of course. The
callout function tcp_timer_enter() chooses the soonest timer and executes
it with lock held.  Unless the timer reports that the tcpcb has been freed,
the callout is rescheduled for the next soonest timer, if there is any.

With a single callout per tcpcb on connection teardown we should be able
to fully stop the callout and immediately free it, avoiding use of
callout_async_drain().  There is one gotcha here: callout_stop() can
actually touch our memory when a rare race condition happens.  See
comment above tcp_timer_stop().  Synchronous stop of the callout makes
tcp_discardcb() the single entry point for tcpcb destructor, merging the
tcp_freecb() to the end of the function.

While here, also remove lots of lingering checks at the beginning of
TCP timer functions.  With a locked callout they are unnecessary.

While here, clean unused parts of timer KPI for the pluggable TCP stacks.

While here, remove TCPDEBUG from tcp_timer.c, as this allows for more
simplification of TCP timers.  The TCPDEBUG is scheduled for removal.

Move the DTrace probes in timers to the beginning of a function, where
a tcpcb always exists.

Discussed with:		rrs, tuexen, rscheff	(the TCP part of the diff)
Reviewed by:		hselasky, kib, mav	(the callout part)
Differential revision:	https://reviews.freebsd.org/D37321
This commit is contained in:
Gleb Smirnoff 2022-12-07 09:00:48 -08:00
parent 918fa4227d
commit 446ccdd08e
8 changed files with 257 additions and 534 deletions

View file

@ -5285,37 +5285,13 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
}
}
static void
bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type)
static int
bbr_stopall(struct tcpcb *tp)
{
struct tcp_bbr *bbr;
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
bbr->rc_all_timers_stopped = 1;
return;
}
/*
* stop all timers always returning 0.
*/
static int
bbr_stopall(struct tcpcb *tp)
{
return (0);
}
static void
bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
{
return;
}
/*
* return true if a bbr timer (rack or tlp) is active.
*/
static int
bbr_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
return (0);
}
@ -14168,9 +14144,6 @@ struct tcp_function_block __tcp_bbr = {
.tfb_tcp_fb_init = bbr_init,
.tfb_tcp_fb_fini = bbr_fini,
.tfb_tcp_timer_stop_all = bbr_stopall,
.tfb_tcp_timer_activate = bbr_timer_activate,
.tfb_tcp_timer_active = bbr_timer_active,
.tfb_tcp_timer_stop = bbr_timer_stop,
.tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
.tfb_tcp_handoff_ok = bbr_handoff_ok,
.tfb_tcp_mtu_chg = bbr_mtu_chg,

View file

@ -489,10 +489,6 @@ static void rack_remxt_tmr(struct tcpcb *tp);
static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
@ -5910,9 +5906,6 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
*/
struct rack_sendmap *rsm;
if (tp->tt_flags & TT_STOPPED) {
return (1);
}
counter_u64_add(rack_to_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
@ -6123,9 +6116,6 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
uint32_t out, avail;
int collapsed_win = 0;
if (tp->tt_flags & TT_STOPPED) {
return (1);
}
if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
/* Its not time yet */
return (0);
@ -6312,9 +6302,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
static int
rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
if (tp->tt_flags & TT_STOPPED) {
return (1);
}
rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
tp->t_flags &= ~TF_DELACK;
tp->t_flags |= TF_ACKNOW;
@ -6337,9 +6325,6 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
struct tcptemp *t_template;
int32_t retval = 1;
if (tp->tt_flags & TT_STOPPED) {
return (1);
}
if (rack->rc_in_persist == 0)
return (0);
if (ctf_progress_timeout_check(tp, false)) {
@ -6425,9 +6410,6 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
struct tcptemp *t_template;
struct inpcb *inp = tptoinpcb(tp);
if (tp->tt_flags & TT_STOPPED) {
return (1);
}
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
/*
@ -6654,9 +6636,6 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
int32_t retval = 0;
bool isipv6;
if (tp->tt_flags & TT_STOPPED) {
return (1);
}
if ((tp->t_flags & TF_GPUTINPROG) &&
(tp->t_rxtshift)) {
/*
@ -7060,12 +7039,6 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin
rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
}
static void
rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
return;
}
static int
rack_stopall(struct tcpcb *tp)
{
@ -7075,18 +7048,6 @@ rack_stopall(struct tcpcb *tp)
return (0);
}
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
{
return;
}
static int
rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
return (0);
}
static void
rack_stop_all_timers(struct tcpcb *tp)
{
@ -20307,9 +20268,6 @@ static struct tcp_function_block __tcp_rack = {
.tfb_tcp_fb_init = rack_init,
.tfb_tcp_fb_fini = rack_fini,
.tfb_tcp_timer_stop_all = rack_stopall,
.tfb_tcp_timer_activate = rack_timer_activate,
.tfb_tcp_timer_active = rack_timer_active,
.tfb_tcp_timer_stop = rack_timer_stop,
.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
.tfb_tcp_handoff_ok = rack_handoff_ok,
.tfb_tcp_mtu_chg = rack_mtu_change,

View file

@ -1194,22 +1194,6 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
*num_names = 0;
return (EINVAL);
}
if (blk->tfb_tcp_timer_stop_all ||
blk->tfb_tcp_timer_activate ||
blk->tfb_tcp_timer_active ||
blk->tfb_tcp_timer_stop) {
/*
* If you define one timer function you
* must have them all.
*/
if ((blk->tfb_tcp_timer_stop_all == NULL) ||
(blk->tfb_tcp_timer_activate == NULL) ||
(blk->tfb_tcp_timer_active == NULL) ||
(blk->tfb_tcp_timer_stop == NULL)) {
*num_names = 0;
return (EINVAL);
}
}
if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
*num_names = 0;
@ -2227,12 +2211,9 @@ tcp_newtcpcb(struct inpcb *inp)
#endif /* INET6 */
V_tcp_mssdflt;
/* Set up our timeouts. */
callout_init(&tp->tt_rexmt, 1);
callout_init(&tp->tt_persist, 1);
callout_init(&tp->tt_keep, 1);
callout_init(&tp->tt_2msl, 1);
callout_init(&tp->tt_delack, 1);
callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
for (int i = 0; i < TT_N; i++)
tp->t_timers[i] = SBT_MAX;
switch (V_tcp_do_rfc1323) {
case 0:
@ -2301,13 +2282,6 @@ tcp_newtcpcb(struct inpcb *inp)
if (V_tcp_do_lrd)
tp->t_flags |= TF_LRD;
/*
* XXXGL: this self-reference might be pointless. It will go away
* when the TCP timers are properly locked and could never fire after
* tcp_discardcb().
*/
in_pcbref(inp);
return (tp);
}
@ -2341,32 +2315,15 @@ void
tcp_discardcb(struct tcpcb *tp)
{
struct inpcb *inp = tptoinpcb(tp);
struct socket *so = tptosocket(tp);
#ifdef INET6
bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif
INP_WLOCK_ASSERT(inp);
/*
* Make sure that all of our timers are stopped before we delete the
* PCB.
*
* If stopping a timer fails, we schedule a discard function in same
* callout, and the last discard function called will take care of
* deleting the tcpcb.
*/
tp->tt_draincnt = 0;
tcp_timer_stop(tp, TT_REXMT);
tcp_timer_stop(tp, TT_PERSIST);
tcp_timer_stop(tp, TT_KEEP);
tcp_timer_stop(tp, TT_2MSL);
tcp_timer_stop(tp, TT_DELACK);
tcp_timer_stop(tp);
if (tp->t_fb->tfb_tcp_timer_stop_all) {
/*
* Call the stop-all function of the methods,
* this function should call the tcp_timer_stop()
* method with each of the function specific timeouts.
* That stop will be called via the tfb_tcp_timer_stop()
* which should use the async drain function of the
* callout system (see tcp_var.h).
*/
tp->t_fb->tfb_tcp_timer_stop_all(tp);
}
@ -2402,23 +2359,7 @@ tcp_discardcb(struct tcpcb *tp)
#endif
CC_ALGO(tp) = NULL;
if (tp->tt_draincnt == 0)
tcp_freecb(tp);
}
bool
tcp_freecb(struct tcpcb *tp)
{
struct inpcb *inp = tptoinpcb(tp);
struct socket *so = tptosocket(tp);
#ifdef INET6
bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif
INP_WLOCK_ASSERT(inp);
MPASS(tp->tt_draincnt == 0);
/* We own the last reference on tcpcb, let's free it. */
#ifdef TCP_BLACKBOX
tcp_log_tcpcbfini(tp);
#endif
@ -2489,8 +2430,6 @@ tcp_freecb(struct tcpcb *tp)
}
refcount_release(&tp->t_fb->tfb_refcnt);
return (in_pcbrele_wlocked(inp));
}
/*
@ -3940,17 +3879,17 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
(tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
now = getsbinuptime();
#define COPYTIMER(ttt) do { \
if (callout_active(&tp->ttt)) \
xt->ttt = (tp->ttt.c_time - now) / SBT_1MS; \
else \
xt->ttt = 0; \
#define COPYTIMER(which,where) do { \
if (tp->t_timers[which] != SBT_MAX) \
xt->where = (tp->t_timers[which] - now) / SBT_1MS; \
else \
xt->where = 0; \
} while (0)
COPYTIMER(tt_delack);
COPYTIMER(tt_rexmt);
COPYTIMER(tt_persist);
COPYTIMER(tt_keep);
COPYTIMER(tt_2msl);
COPYTIMER(TT_DELACK, tt_delack);
COPYTIMER(TT_REXMT, tt_rexmt);
COPYTIMER(TT_PERSIST, tt_persist);
COPYTIMER(TT_KEEP, tt_keep);
COPYTIMER(TT_2MSL, tt_2msl);
#undef COPYTIMER
xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;

View file

@ -243,104 +243,86 @@ int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
/*
* TCP timer processing.
*
* Each connection has 5 timers associated with it, which can be scheduled
* simultaneously. They all are serviced by one callout tcp_timer_enter().
* This function executes the next timer via tcp_timersw[] vector. Each
* timer is supposed to return 'true' unless the connection was destroyed.
* In the former case tcp_timer_enter() will schedule callout for next timer.
*/
void
tcp_timer_delack(void *xtp)
typedef bool tcp_timer_t(struct tcpcb *);
static tcp_timer_t tcp_timer_delack;
static tcp_timer_t tcp_timer_2msl;
static tcp_timer_t tcp_timer_keep;
static tcp_timer_t tcp_timer_persist;
static tcp_timer_t tcp_timer_rexmt;
static tcp_timer_t * const tcp_timersw[TT_N] = {
[TT_DELACK] = tcp_timer_delack,
[TT_REXMT] = tcp_timer_rexmt,
[TT_PERSIST] = tcp_timer_persist,
[TT_KEEP] = tcp_timer_keep,
[TT_2MSL] = tcp_timer_2msl,
};
/*
* tcp_output_locked() is a timer specific variation of call to tcp_output(),
* see tcp_var.h for the rest. It handles drop request from advanced stacks,
* but keeps tcpcb locked unless tcp_drop() destroyed it.
* Returns true if tcpcb is valid and locked.
*/
static inline bool
tcp_output_locked(struct tcpcb *tp)
{
int rv;
INP_WLOCK_ASSERT(tptoinpcb(tp));
if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
("TCP stack %s requested tcp_drop(%p)",
tp->t_fb->tfb_tcp_block_name, tp));
tp = tcp_drop(tp, rv);
}
return (tp != NULL);
}
static bool
tcp_timer_delack(struct tcpcb *tp)
{
struct epoch_tracker et;
struct tcpcb *tp = xtp;
#if defined(INVARIANTS) || defined(VIMAGE)
struct inpcb *inp = tptoinpcb(tp);
#endif
bool rv;
INP_WLOCK_ASSERT(inp);
INP_WLOCK(inp);
CURVNET_SET(inp->inp_vnet);
if (callout_pending(&tp->tt_delack) ||
!callout_active(&tp->tt_delack)) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->tt_delack);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
NET_EPOCH_ENTER(et);
(void) tcp_output_unlock(tp);
rv = tcp_output_locked(tp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (rv);
}
/*
* Call tcp_close() from a callout context.
*/
static void
tcp_timer_close(struct tcpcb *tp)
static bool
tcp_timer_2msl(struct tcpcb *tp)
{
struct epoch_tracker et;
struct inpcb *inp = tptoinpcb(tp);
bool close = false;
INP_WLOCK_ASSERT(inp);
NET_EPOCH_ENTER(et);
tp = tcp_close(tp);
NET_EPOCH_EXIT(et);
if (tp != NULL)
INP_WUNLOCK(inp);
}
/*
* Call tcp_drop() from a callout context.
*/
static void
tcp_timer_drop(struct tcpcb *tp)
{
struct epoch_tracker et;
struct inpcb *inp = tptoinpcb(tp);
INP_WLOCK_ASSERT(inp);
NET_EPOCH_ENTER(et);
tp = tcp_drop(tp, ETIMEDOUT);
NET_EPOCH_EXIT(et);
if (tp != NULL)
INP_WUNLOCK(inp);
}
void
tcp_timer_2msl(void *xtp)
{
struct tcpcb *tp = xtp;
struct inpcb *inp = tptoinpcb(tp);
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
INP_WLOCK(inp);
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
CURVNET_SET(inp->inp_vnet);
tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
tcp_free_sackholes(tp);
if (callout_pending(&tp->tt_2msl) ||
!callout_active(&tp->tt_2msl)) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->tt_2msl);
if (inp->inp_flags & INP_DROPPED) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
KASSERT((tp->tt_flags & TT_STOPPED) == 0,
("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* 2 MSL timeout in shutdown went off. If we're closed but
* still waiting for peer to close and connection has been idle
@ -354,69 +336,41 @@ tcp_timer_2msl(void *xtp)
* XXXGL: check if inp_socket shall always be !NULL here?
*/
if (tp->t_state == TCPS_TIME_WAIT) {
tcp_timer_close(tp);
CURVNET_RESTORE();
return;
close = true;
} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
tcp_fast_finwait2_recycle && inp->inp_socket &&
(inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
TCPSTAT_INC(tcps_finwait2_drops);
tcp_timer_close(tp);
CURVNET_RESTORE();
return;
close = true;
} else {
if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
callout_reset(&tp->tt_2msl,
TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
} else {
tcp_timer_close(tp);
CURVNET_RESTORE();
return;
}
if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
else
close = true;
}
if (close) {
struct epoch_tracker et;
#ifdef TCPDEBUG
if (tptosocket(tp)->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
INP_WUNLOCK(inp);
NET_EPOCH_ENTER(et);
tp = tcp_close(tp);
NET_EPOCH_EXIT(et);
}
CURVNET_RESTORE();
return (tp != NULL);
}
void
tcp_timer_keep(void *xtp)
static bool
tcp_timer_keep(struct tcpcb *tp)
{
struct epoch_tracker et;
struct tcpcb *tp = xtp;
struct inpcb *inp = tptoinpcb(tp);
struct tcptemp *t_template;
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
INP_WLOCK_ASSERT(inp);
INP_WLOCK(inp);
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
CURVNET_SET(inp->inp_vnet);
if (callout_pending(&tp->tt_keep) ||
!callout_active(&tp->tt_keep)) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->tt_keep);
if (inp->inp_flags & INP_DROPPED) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
KASSERT((tp->tt_flags & TT_STOPPED) == 0,
("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* Because we don't regularly reset the keepalive callout in
* the ESTABLISHED state, it may be that we don't actually need
@ -428,11 +382,10 @@ tcp_timer_keep(void *xtp)
idletime = ticks - tp->t_rcvtime;
if (idletime < TP_KEEPIDLE(tp)) {
callout_reset(&tp->tt_keep,
TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
INP_WUNLOCK(inp);
tcp_timer_activate(tp, TT_KEEP,
TP_KEEPIDLE(tp) - idletime);
CURVNET_RESTORE();
return;
return (true);
}
}
@ -470,38 +423,22 @@ tcp_timer_keep(void *xtp)
NET_EPOCH_EXIT(et);
free(t_template, M_TEMP);
}
callout_reset(&tp->tt_keep, TP_KEEPINTVL(tp),
tcp_timer_keep, tp);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
} else
callout_reset(&tp->tt_keep, TP_KEEPIDLE(tp),
tcp_timer_keep, tp);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
#ifdef TCPDEBUG
if (inp->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
return (true);
dropit:
TCPSTAT_INC(tcps_keepdrops);
NET_EPOCH_ENTER(et);
tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
NET_EPOCH_EXIT(et);
if (tp != NULL)
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return (tp != NULL);
}
/*
@ -529,37 +466,19 @@ tcp_maxunacktime_check(struct tcpcb *tp)
return true;
}
void
tcp_timer_persist(void *xtp)
static bool
tcp_timer_persist(struct tcpcb *tp)
{
struct epoch_tracker et;
struct tcpcb *tp = xtp;
#if defined(INVARIANTS) || defined(VIMAGE)
struct inpcb *inp = tptoinpcb(tp);
bool progdrop;
int outrv;
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
bool progdrop, rv;
INP_WLOCK(inp);
INP_WLOCK_ASSERT(inp);
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
CURVNET_SET(inp->inp_vnet);
if (callout_pending(&tp->tt_persist) ||
!callout_active(&tp->tt_persist)) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->tt_persist);
if (inp->inp_flags & INP_DROPPED) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
KASSERT((tp->tt_flags & TT_STOPPED) == 0,
("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* Persistence timer into zero window.
* Force a byte to be output, if possible.
@ -581,9 +500,7 @@ tcp_timer_persist(void *xtp)
if (!progdrop)
TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_timer_drop(tp);
CURVNET_RESTORE();
return;
goto dropit;
}
/*
* If the user has closed the socket then drop a persisting
@ -593,57 +510,39 @@ tcp_timer_persist(void *xtp)
(ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_timer_drop(tp);
CURVNET_RESTORE();
return;
goto dropit;
}
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
NET_EPOCH_ENTER(et);
outrv = tcp_output_nodrop(tp);
tp->t_flags &= ~TF_FORCEDATA;
#ifdef TCPDEBUG
if (tp != NULL && tptosocket(tp)->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
(void) tcp_unlock_or_drop(tp, outrv);
if ((rv = tcp_output_locked(tp)))
tp->t_flags &= ~TF_FORCEDATA;
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (rv);
dropit:
NET_EPOCH_ENTER(et);
tp = tcp_drop(tp, ETIMEDOUT);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (tp != NULL);
}
void
tcp_timer_rexmt(void * xtp)
static bool
tcp_timer_rexmt(struct tcpcb *tp)
{
struct epoch_tracker et;
struct tcpcb *tp = xtp;
struct inpcb *inp = tptoinpcb(tp);
int rexmt, outrv;
bool isipv6;
#ifdef TCPDEBUG
int ostate;
int rexmt;
bool isipv6, rv;
ostate = tp->t_state;
#endif
INP_WLOCK_ASSERT(inp);
INP_WLOCK(inp);
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
CURVNET_SET(inp->inp_vnet);
if (callout_pending(&tp->tt_rexmt) ||
!callout_active(&tp->tt_rexmt)) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->tt_rexmt);
if (inp->inp_flags & INP_DROPPED) {
INP_WUNLOCK(inp);
CURVNET_RESTORE();
return;
}
KASSERT((tp->tt_flags & TT_STOPPED) == 0,
("%s: tp %p tcpcb can't be stopped here", __func__, tp));
tcp_free_sackholes(tp);
TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
if (tp->t_fb->tfb_tcp_rexmit_tmr) {
@ -664,9 +563,12 @@ tcp_timer_rexmt(void * xtp)
TCPSTAT_INC(tcps_timeoutdrop);
tp->t_rxtshift = TCP_MAXRXTSHIFT;
tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
tcp_timer_drop(tp);
NET_EPOCH_ENTER(et);
tp = tcp_drop(tp, ETIMEDOUT);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return;
return (tp != NULL);
}
if (tp->t_state == TCPS_SYN_SENT) {
/*
@ -883,159 +785,131 @@ tcp_timer_rexmt(void * xtp)
cc_cong_signal(tp, NULL, CC_RTO);
NET_EPOCH_ENTER(et);
outrv = tcp_output_nodrop(tp);
#ifdef TCPDEBUG
if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
(void) tcp_unlock_or_drop(tp, outrv);
rv = tcp_output_locked(tp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
return (rv);
}
void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
static inline tt_which
tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
{
struct callout *t_callout;
callout_func_t *f_callout;
tt_which i, rv;
sbintime_t after, before;
for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
if (tp->t_timers[i] < after) {
after = tp->t_timers[i];
rv = i;
}
before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
}
if (precision != NULL)
*precision = before - after;
return (rv);
}
static void
tcp_timer_enter(void *xtp)
{
struct tcpcb *tp = xtp;
struct inpcb *inp = tptoinpcb(tp);
int cpu = inp_to_cpuid(inp);
sbintime_t precision;
tt_which which;
INP_WLOCK_ASSERT(inp);
MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0);
curthread->td_pflags |= TDP_INTCPCALLOUT;
which = tcp_timer_next(tp, NULL);
MPASS(which < TT_N);
tp->t_timers[which] = SBT_MAX;
tp->t_precisions[which] = 0;
if (tcp_timersw[which](tp)) {
if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
callout_reset_sbt_on(&tp->t_callout,
tp->t_timers[which], precision, tcp_timer_enter,
tp, inp_to_cpuid(inp), C_ABSOLUTE);
}
INP_WUNLOCK(inp);
}
curthread->td_pflags &= ~TDP_INTCPCALLOUT;
}
/*
* Activate or stop (delta == 0) a TCP timer.
*/
void
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
{
struct inpcb *inp = tptoinpcb(tp);
sbintime_t precision;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return;
#endif
if (tp->tt_flags & TT_STOPPED)
return;
INP_WLOCK_ASSERT(inp);
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->tt_delack;
f_callout = tcp_timer_delack;
break;
case TT_REXMT:
t_callout = &tp->tt_rexmt;
f_callout = tcp_timer_rexmt;
break;
case TT_PERSIST:
t_callout = &tp->tt_persist;
f_callout = tcp_timer_persist;
break;
case TT_KEEP:
t_callout = &tp->tt_keep;
f_callout = tcp_timer_keep;
break;
case TT_2MSL:
t_callout = &tp->tt_2msl;
f_callout = tcp_timer_2msl;
break;
default:
if (tp->t_fb->tfb_tcp_timer_activate) {
tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
return;
}
panic("tp %p bad timer_type %#x", tp, timer_type);
}
if (delta == 0) {
callout_stop(t_callout);
} else {
callout_reset_on(t_callout, delta, f_callout, tp, cpu);
}
if (delta > 0)
callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
&tp->t_timers[which], &tp->t_precisions[which]);
else
tp->t_timers[which] = SBT_MAX;
if ((which = tcp_timer_next(tp, &precision)) != TT_N)
callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which],
precision, tcp_timer_enter, tp, inp_to_cpuid(inp),
C_ABSOLUTE);
else
callout_stop(&tp->t_callout);
}
int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
bool
tcp_timer_active(struct tcpcb *tp, tt_which which)
{
struct callout *t_callout;
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->tt_delack;
break;
case TT_REXMT:
t_callout = &tp->tt_rexmt;
break;
case TT_PERSIST:
t_callout = &tp->tt_persist;
break;
case TT_KEEP:
t_callout = &tp->tt_keep;
break;
case TT_2MSL:
t_callout = &tp->tt_2msl;
break;
default:
if (tp->t_fb->tfb_tcp_timer_active) {
return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
}
panic("tp %p bad timer_type %#x", tp, timer_type);
}
return callout_active(t_callout);
INP_WLOCK_ASSERT(tptoinpcb(tp));
return (tp->t_timers[which] != SBT_MAX);
}
static void
tcp_timer_discard(void *ptp)
/*
* Stop all timers associated with tcpcb.
*
* Called only on tcpcb destruction. The tcpcb shall already be dropped from
* the pcb lookup database and socket is not losing the last reference.
*
* XXXGL: unfortunately our callout(9) is not able to fully stop a locked
* callout even when only two threads are involved: the callout itself and the
* thread that does callout_stop(). See where softclock_call_cc() swaps the
* callwheel lock to callout lock and then checks cc_exec_cancel(). This is
* the race window. If it happens, the tcp_timer_enter() won't be executed,
* however pcb lock will be locked and released, hence we can't free memory.
* Until callout(9) is improved, just keep retrying. In my profiling I've seen
* such event happening less than 1 time per hour with 20-30 Gbit/s of traffic.
*/
void
tcp_timer_stop(struct tcpcb *tp)
{
struct epoch_tracker et;
struct tcpcb *tp = (struct tcpcb *)ptp;
struct inpcb *inp = tptoinpcb(tp);
INP_WLOCK(inp);
CURVNET_SET(inp->inp_vnet);
NET_EPOCH_ENTER(et);
INP_WLOCK_ASSERT(inp);
KASSERT((tp->tt_flags & TT_STOPPED) != 0,
("%s: tcpcb has to be stopped here", __func__));
if (--tp->tt_draincnt > 0 ||
tcp_freecb(tp) == false)
if (curthread->td_pflags & TDP_INTCPCALLOUT) {
int stopped __diagused;
stopped = callout_stop(&tp->t_callout);
MPASS(stopped == 0);
} else while(__predict_false(callout_stop(&tp->t_callout) == 0)) {
INP_WUNLOCK(inp);
NET_EPOCH_EXIT(et);
CURVNET_RESTORE();
}
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
struct callout *t_callout;
tp->tt_flags |= TT_STOPPED;
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->tt_delack;
break;
case TT_REXMT:
t_callout = &tp->tt_rexmt;
break;
case TT_PERSIST:
t_callout = &tp->tt_persist;
break;
case TT_KEEP:
t_callout = &tp->tt_keep;
break;
case TT_2MSL:
t_callout = &tp->tt_2msl;
break;
default:
if (tp->t_fb->tfb_tcp_timer_stop) {
/*
* XXXrrs we need to look at this with the
* stop case below (flags).
*/
tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
return;
}
panic("tp %p bad timer_type %#x", tp, timer_type);
}
if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
/*
* Can't stop the callout, defer tcpcb actual deletion
* to the last one. We do this using the async drain
* function and incrementing the count in
*/
tp->tt_draincnt++;
kern_yield(PRI_UNCHANGED);
INP_WLOCK(inp);
}
}

View file

@ -145,18 +145,6 @@ static const char *tcptimers[] =
#ifdef _KERNEL
/*
* Flags for the tcpcb's tt_flags field.
*/
#define TT_DELACK 0x0001
#define TT_REXMT 0x0002
#define TT_PERSIST 0x0004
#define TT_KEEP 0x0008
#define TT_2MSL 0x0010
#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
#define TT_STOPPED 0x00010000
#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
#define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
@ -205,13 +193,6 @@ VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss);
VNET_DECLARE(int, tcp_msl);
#define V_tcp_msl VNET(tcp_msl)
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
void tcp_timer_delack(void *xtp);
#endif /* _KERNEL */
#endif /* !_NETINET_TCP_TIMER_H_ */

View file

@ -3072,10 +3072,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
db_print_indent(indent);
db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",
&tp->tt_rexmt, &tp->tt_persist, &tp->tt_keep);
db_printf("tt_2msl: %p tt_delack: %p\n", &tp->tt_2msl,
&tp->tt_delack);
db_printf("t_callout: %p t_timers: %p\n",
&tp->t_callout, &tp->t_timers);
db_print_indent(indent);
db_printf("t_state: %d (", tp->t_state);

View file

@ -126,6 +126,15 @@ struct sackhint {
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
typedef enum {
TT_DELACK = 0,
TT_REXMT,
TT_PERSIST,
TT_KEEP,
TT_2MSL,
TT_N,
} tt_which;
/*
* Tcp control block, one per tcp connection.
*/
@ -137,13 +146,9 @@ struct tcpcb {
struct tcp_function_block *t_fb;/* TCP function call block */
void *t_fb_ptr; /* Pointer to t_fb specific data */
struct callout tt_rexmt; /* retransmit timer */
struct callout tt_persist; /* retransmit persistence */
struct callout tt_keep; /* keepalive */
struct callout tt_2msl; /* 2*msl TIME_WAIT timer */
struct callout tt_delack; /* delayed ACK timer */
uint32_t tt_flags; /* Timers flags */
uint32_t tt_draincnt; /* Count being drained */
struct callout t_callout;
sbintime_t t_timers[TT_N];
sbintime_t t_precisions[TT_N];
uint32_t t_maxseg:24, /* maximum segment size */
t_logstate:8; /* State of "black box" logging */
@ -370,10 +375,6 @@ struct tcp_function_block {
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
void (*tfb_tcp_timer_activate)(struct tcpcb *,
uint32_t, u_int);
int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
void (*tfb_tcp_mtu_chg)(struct tcpcb *);
@ -1086,7 +1087,6 @@ int tcp_addoptions(struct tcpopt *, u_char *);
struct tcpcb *
tcp_close(struct tcpcb *);
void tcp_discardcb(struct tcpcb *);
bool tcp_freecb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
int tcp_ctloutput(struct socket *, struct sockopt *);
void tcp_fini(void *);
@ -1186,9 +1186,9 @@ void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp);
struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *);
void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
int tcp_timer_active(struct tcpcb *, uint32_t);
void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_timer_activate(struct tcpcb *, tt_which, u_int);
bool tcp_timer_active(struct tcpcb *, tt_which);
void tcp_timer_stop(struct tcpcb *);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
int inp_to_cpuid(struct inpcb *inp);
/*

View file

@ -557,7 +557,7 @@ enum {
#define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */
#define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */
#define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */
#define TDP_UNUSED0 0x20000000 /* UNUSED */
#define TDP_INTCPCALLOUT 0x20000000 /* used by netinet/tcp_timer.c */
#define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */
#define TDP_SIGFASTPENDING 0x80000000 /* Pending signal due to sigfastblock */