mirror of
https://github.com/freebsd/freebsd-src
synced 2024-10-07 00:50:50 +00:00
lro: separate HPTS specific code into tcp_lro_hpts.c
Put same copyright header as tcp_hpts.c has, since all this code was developed by Randall Stewart <rrs@FreeBSD.org> as a part of the HPTS work. Also copy Mellanox copyright from tcp_lro.c as Hans Petter Selasky also participated in restructuring the code. Reviewed by: imp, tuexen, rrs Differential Revision: https://reviews.freebsd.org/D42854
This commit is contained in:
parent
5b0010b467
commit
4f9c93f16c
|
@ -4354,6 +4354,7 @@ netinet/tcp_hostcache.c optional inet | inet6
|
|||
netinet/tcp_input.c optional inet | inet6
|
||||
netinet/tcp_log_buf.c optional tcp_blackbox inet | tcp_blackbox inet6
|
||||
netinet/tcp_lro.c optional inet | inet6
|
||||
netinet/tcp_lro_hpts.c optional tcphpts inet | tcphpts inet6
|
||||
netinet/tcp_output.c optional inet | inet6
|
||||
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
|
||||
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
.PATH: ${SRCTOP}/sys/netinet
|
||||
|
||||
KMOD= tcphpts
|
||||
SRCS= tcp_hpts.c opt_inet.h opt_inet6.h opt_rss.h device_if.h bus_if.h
|
||||
SRCS= tcp_hpts.c tcp_lro_hpts.c \
|
||||
opt_inet.h opt_inet6.h opt_rss.h device_if.h bus_if.h
|
||||
|
||||
.include <bsd.kmod.mk>
|
||||
|
|
|
@ -80,25 +80,14 @@
|
|||
|
||||
static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
|
||||
|
||||
#define TCP_LRO_TS_OPTION \
|
||||
ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
|
||||
(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
|
||||
|
||||
static void tcp_lro_rx_done(struct lro_ctrl *lc);
|
||||
static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m,
|
||||
uint32_t csum, bool use_hash);
|
||||
|
||||
#ifdef TCPHPTS
|
||||
static bool do_bpf_strip_and_compress(struct tcpcb *, struct lro_ctrl *,
|
||||
struct lro_entry *, struct mbuf **, struct mbuf **, struct mbuf **,
|
||||
bool *, bool, bool, struct ifnet *, bool);
|
||||
|
||||
#endif
|
||||
|
||||
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
|
||||
"TCP LRO");
|
||||
|
||||
static long tcplro_stacks_wanting_mbufq;
|
||||
long tcplro_stacks_wanting_mbufq;
|
||||
counter_u64_t tcp_inp_lro_direct_queue;
|
||||
counter_u64_t tcp_inp_lro_wokeup_queue;
|
||||
counter_u64_t tcp_inp_lro_compressed;
|
||||
|
@ -487,12 +476,6 @@ tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po)
|
|||
return (TCP_LRO_CANNOT);
|
||||
}
|
||||
|
||||
static struct tcphdr *
|
||||
tcp_lro_get_th(struct mbuf *m)
|
||||
{
|
||||
return ((struct tcphdr *)((uint8_t *)m->m_data + m->m_pkthdr.lro_tcp_h_off));
|
||||
}
|
||||
|
||||
static void
|
||||
lro_free_mbuf_chain(struct mbuf *m)
|
||||
{
|
||||
|
@ -680,58 +663,6 @@ tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef TCPHPTS
|
||||
static void
|
||||
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
|
||||
const struct lro_entry *le, const struct mbuf *m,
|
||||
int frm, int32_t tcp_data_len, uint32_t th_seq,
|
||||
uint32_t th_ack, uint16_t th_win)
|
||||
{
|
||||
if (tcp_bblogging_on(tp)) {
|
||||
union tcp_log_stackspecific log;
|
||||
struct timeval tv, btv;
|
||||
uint32_t cts;
|
||||
|
||||
cts = tcp_get_usecs(&tv);
|
||||
memset(&log, 0, sizeof(union tcp_log_stackspecific));
|
||||
log.u_bbr.flex8 = frm;
|
||||
log.u_bbr.flex1 = tcp_data_len;
|
||||
if (m)
|
||||
log.u_bbr.flex2 = m->m_pkthdr.len;
|
||||
else
|
||||
log.u_bbr.flex2 = 0;
|
||||
if (le->m_head) {
|
||||
log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
|
||||
log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
|
||||
log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
|
||||
log.u_bbr.delRate = le->m_head->m_flags;
|
||||
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
|
||||
}
|
||||
log.u_bbr.inflight = th_seq;
|
||||
log.u_bbr.delivered = th_ack;
|
||||
log.u_bbr.timeStamp = cts;
|
||||
log.u_bbr.epoch = le->next_seq;
|
||||
log.u_bbr.lt_epoch = le->ack_seq;
|
||||
log.u_bbr.pacing_gain = th_win;
|
||||
log.u_bbr.cwnd_gain = le->window;
|
||||
log.u_bbr.lost = curcpu;
|
||||
log.u_bbr.cur_del_rate = (uintptr_t)m;
|
||||
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
|
||||
bintime2timeval(&lc->lro_last_queue_time, &btv);
|
||||
log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
|
||||
log.u_bbr.flex7 = le->compressed;
|
||||
log.u_bbr.pacing_gain = le->uncompressed;
|
||||
if (in_epoch(net_epoch_preempt))
|
||||
log.u_bbr.inhpts = 1;
|
||||
else
|
||||
log.u_bbr.inhpts = 0;
|
||||
TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
|
||||
&tptosocket(tp)->so_snd,
|
||||
TCP_LOG_LRO, 0, 0, &log, false, &tv);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void
|
||||
tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum)
|
||||
{
|
||||
|
@ -1175,276 +1106,6 @@ tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le)
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef TCPHPTS
|
||||
static void
|
||||
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
|
||||
{
|
||||
|
||||
INP_WLOCK_ASSERT(tptoinpcb(tp));
|
||||
|
||||
STAILQ_HEAD(, mbuf) q = { le->m_head,
|
||||
&STAILQ_NEXT(le->m_last_mbuf, m_stailqpkt) };
|
||||
STAILQ_CONCAT(&tp->t_inqueue, &q);
|
||||
le->m_head = NULL;
|
||||
le->m_last_mbuf = NULL;
|
||||
}
|
||||
|
||||
static bool
|
||||
tcp_lro_check_wake_status(struct tcpcb *tp)
|
||||
{
|
||||
|
||||
if (tp->t_fb->tfb_early_wake_check != NULL)
|
||||
return ((tp->t_fb->tfb_early_wake_check)(tp));
|
||||
return (false);
|
||||
}
|
||||
|
||||
static struct mbuf *
|
||||
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
|
||||
struct tcpcb *tp, int32_t *new_m, bool can_append_old_cmp)
|
||||
{
|
||||
struct mbuf *m;
|
||||
|
||||
/* Look at the last mbuf if any in queue */
|
||||
if (can_append_old_cmp) {
|
||||
m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt);
|
||||
if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
|
||||
if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
|
||||
tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
|
||||
*new_m = 0;
|
||||
counter_u64_add(tcp_extra_mbuf, 1);
|
||||
return (m);
|
||||
} else {
|
||||
/* Mark we ran out of space */
|
||||
tp->t_flags2 |= TF2_MBUF_L_ACKS;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Decide mbuf size. */
|
||||
tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
|
||||
if (tp->t_flags2 & TF2_MBUF_L_ACKS)
|
||||
m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
|
||||
else
|
||||
m = m_gethdr(M_NOWAIT, MT_DATA);
|
||||
|
||||
if (__predict_false(m == NULL)) {
|
||||
counter_u64_add(tcp_would_have_but, 1);
|
||||
return (NULL);
|
||||
}
|
||||
counter_u64_add(tcp_comp_total, 1);
|
||||
m->m_pkthdr.rcvif = lc->ifp;
|
||||
m->m_flags |= M_ACKCMP;
|
||||
*new_m = 1;
|
||||
return (m);
|
||||
}
|
||||
|
||||
static struct tcpcb *
|
||||
tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
|
||||
{
|
||||
struct inpcb *inp;
|
||||
|
||||
switch (pa->data.lro_type) {
|
||||
#ifdef INET6
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
inp = in6_pcblookup(&V_tcbinfo,
|
||||
&pa->data.s_addr.v6,
|
||||
pa->data.s_port,
|
||||
&pa->data.d_addr.v6,
|
||||
pa->data.d_port,
|
||||
INPLOOKUP_WLOCKPCB,
|
||||
ifp);
|
||||
break;
|
||||
#endif
|
||||
#ifdef INET
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
inp = in_pcblookup(&V_tcbinfo,
|
||||
pa->data.s_addr.v4,
|
||||
pa->data.s_port,
|
||||
pa->data.d_addr.v4,
|
||||
pa->data.d_port,
|
||||
INPLOOKUP_WLOCKPCB,
|
||||
ifp);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
return (intotcpcb(inp));
|
||||
}
|
||||
|
||||
static inline bool
|
||||
tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
|
||||
{
|
||||
/*
|
||||
* This function returns two bits of valuable information.
|
||||
* a) Is what is present capable of being ack-compressed,
|
||||
* we can ack-compress if there is no options or just
|
||||
* a timestamp option, and of course the th_flags must
|
||||
* be correct as well.
|
||||
* b) Our other options present such as SACK. This is
|
||||
* used to determine if we want to wakeup or not.
|
||||
*/
|
||||
bool ret = true;
|
||||
|
||||
switch (th->th_off << 2) {
|
||||
case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
|
||||
*ppts = (uint32_t *)(th + 1);
|
||||
/* Check if we have only one timestamp option. */
|
||||
if (**ppts == TCP_LRO_TS_OPTION)
|
||||
*other_opts = false;
|
||||
else {
|
||||
*other_opts = true;
|
||||
ret = false;
|
||||
}
|
||||
break;
|
||||
case (sizeof(*th)):
|
||||
/* No options. */
|
||||
*ppts = NULL;
|
||||
*other_opts = false;
|
||||
break;
|
||||
default:
|
||||
*ppts = NULL;
|
||||
*other_opts = true;
|
||||
ret = false;
|
||||
break;
|
||||
}
|
||||
/* For ACKCMP we only accept ACK, PUSH, ECE and CWR. */
|
||||
if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
|
||||
ret = false;
|
||||
/* If it has data on it we cannot compress it */
|
||||
if (m->m_pkthdr.lro_tcp_d_len)
|
||||
ret = false;
|
||||
|
||||
/* ACK flag must be set. */
|
||||
if (!(tcp_get_flags(th) & TH_ACK))
|
||||
ret = false;
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static int
|
||||
tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
|
||||
{
|
||||
struct tcpcb *tp;
|
||||
struct mbuf **pp, *cmp, *mv_to;
|
||||
struct ifnet *lagg_ifp;
|
||||
bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;
|
||||
|
||||
/* Check if packet doesn't belongs to our network interface. */
|
||||
if ((tcplro_stacks_wanting_mbufq == 0) ||
|
||||
(le->outer.data.vlan_id != 0) ||
|
||||
(le->inner.data.lro_type != LRO_TYPE_NONE))
|
||||
return (TCP_LRO_CANNOT);
|
||||
|
||||
#ifdef INET6
|
||||
/*
|
||||
* Be proactive about unspecified IPv6 address in source. As
|
||||
* we use all-zero to indicate unbounded/unconnected pcb,
|
||||
* unspecified IPv6 address can be used to confuse us.
|
||||
*
|
||||
* Note that packets with unspecified IPv6 destination is
|
||||
* already dropped in ip6_input.
|
||||
*/
|
||||
if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
|
||||
IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
|
||||
return (TCP_LRO_CANNOT);
|
||||
|
||||
if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
|
||||
IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
|
||||
return (TCP_LRO_CANNOT);
|
||||
#endif
|
||||
/* Lookup inp, if any. Returns locked TCP inpcb. */
|
||||
tp = tcp_lro_lookup(lc->ifp,
|
||||
(le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
|
||||
if (tp == NULL)
|
||||
return (TCP_LRO_CANNOT);
|
||||
|
||||
counter_u64_add(tcp_inp_lro_locks_taken, 1);
|
||||
|
||||
/* Check if the inp is dead, Jim. */
|
||||
if (tp->t_state == TCPS_TIME_WAIT) {
|
||||
INP_WUNLOCK(tptoinpcb(tp));
|
||||
return (TCP_LRO_CANNOT);
|
||||
}
|
||||
if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
|
||||
tp->t_lro_cpu = lc->lro_last_cpu;
|
||||
/* Check if the transport doesn't support the needed optimizations. */
|
||||
if ((tp->t_flags2 & (TF2_SUPPORTS_MBUFQ | TF2_MBUF_ACKCMP)) == 0) {
|
||||
INP_WUNLOCK(tptoinpcb(tp));
|
||||
return (TCP_LRO_CANNOT);
|
||||
}
|
||||
|
||||
if (tp->t_flags2 & TF2_MBUF_QUEUE_READY)
|
||||
should_wake = false;
|
||||
else
|
||||
should_wake = true;
|
||||
/* Check if packets should be tapped to BPF. */
|
||||
bpf_req = bpf_peers_present(lc->ifp->if_bpf);
|
||||
lagg_bpf_req = false;
|
||||
lagg_ifp = NULL;
|
||||
if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
|
||||
lc->ifp->if_type == IFT_INFINIBANDLAG) {
|
||||
struct lagg_port *lp = lc->ifp->if_lagg;
|
||||
struct lagg_softc *sc = lp->lp_softc;
|
||||
|
||||
lagg_ifp = sc->sc_ifp;
|
||||
if (lagg_ifp != NULL)
|
||||
lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
|
||||
}
|
||||
|
||||
/* Strip and compress all the incoming packets. */
|
||||
can_append_old_cmp = true;
|
||||
cmp = NULL;
|
||||
for (pp = &le->m_head; *pp != NULL; ) {
|
||||
mv_to = NULL;
|
||||
if (do_bpf_strip_and_compress(tp, lc, le, pp,
|
||||
&cmp, &mv_to, &should_wake, bpf_req,
|
||||
lagg_bpf_req, lagg_ifp, can_append_old_cmp) == false) {
|
||||
/* Advance to next mbuf. */
|
||||
pp = &(*pp)->m_nextpkt;
|
||||
/*
|
||||
* Once we have appended we can't look in the pending
|
||||
* inbound packets for a compressed ack to append to.
|
||||
*/
|
||||
can_append_old_cmp = false;
|
||||
/*
|
||||
* Once we append we also need to stop adding to any
|
||||
* compressed ack we were remembering. A new cmp
|
||||
* ack will be required.
|
||||
*/
|
||||
cmp = NULL;
|
||||
tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
|
||||
} else if (mv_to != NULL) {
|
||||
/* We are asked to move pp up */
|
||||
pp = &mv_to->m_nextpkt;
|
||||
tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
|
||||
} else
|
||||
tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
|
||||
}
|
||||
/* Update "m_last_mbuf", if any. */
|
||||
if (pp == &le->m_head)
|
||||
le->m_last_mbuf = *pp;
|
||||
else
|
||||
le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);
|
||||
|
||||
/* Check if any data mbufs left. */
|
||||
if (le->m_head != NULL) {
|
||||
counter_u64_add(tcp_inp_lro_direct_queue, 1);
|
||||
tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1);
|
||||
tcp_queue_pkts(tp, le);
|
||||
}
|
||||
if (should_wake) {
|
||||
/* Wakeup */
|
||||
counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
|
||||
if ((*tp->t_fb->tfb_do_queued_segments)(tp, 0))
|
||||
/* TCP cb gone and unlocked. */
|
||||
return (0);
|
||||
}
|
||||
INP_WUNLOCK(tptoinpcb(tp));
|
||||
|
||||
return (0); /* Success. */
|
||||
}
|
||||
#endif
|
||||
|
||||
void
|
||||
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
|
||||
{
|
||||
|
@ -1614,205 +1275,6 @@ tcp_lro_flush_all(struct lro_ctrl *lc)
|
|||
lc->lro_mbuf_count = 0;
|
||||
}
|
||||
|
||||
#ifdef TCPHPTS
|
||||
static void
|
||||
build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
|
||||
uint32_t *ts_ptr, uint16_t iptos)
|
||||
{
|
||||
/*
|
||||
* Given a TCP ACK, summarize it down into the small TCP ACK
|
||||
* entry.
|
||||
*/
|
||||
ae->timestamp = m->m_pkthdr.rcv_tstmp;
|
||||
ae->flags = 0;
|
||||
if (m->m_flags & M_TSTMP_LRO)
|
||||
ae->flags |= TSTMP_LRO;
|
||||
else if (m->m_flags & M_TSTMP)
|
||||
ae->flags |= TSTMP_HDWR;
|
||||
ae->seq = ntohl(th->th_seq);
|
||||
ae->ack = ntohl(th->th_ack);
|
||||
ae->flags |= tcp_get_flags(th);
|
||||
if (ts_ptr != NULL) {
|
||||
ae->ts_value = ntohl(ts_ptr[1]);
|
||||
ae->ts_echo = ntohl(ts_ptr[2]);
|
||||
ae->flags |= HAS_TSTMP;
|
||||
}
|
||||
ae->win = ntohs(th->th_win);
|
||||
ae->codepoint = iptos;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
|
||||
* and strip all, but the IPv4/IPv6 header.
|
||||
*/
|
||||
static bool
|
||||
do_bpf_strip_and_compress(struct tcpcb *tp, struct lro_ctrl *lc,
|
||||
struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp, struct mbuf **mv_to,
|
||||
bool *should_wake, bool bpf_req, bool lagg_bpf_req, struct ifnet *lagg_ifp, bool can_append_old_cmp)
|
||||
{
|
||||
union {
|
||||
void *ptr;
|
||||
struct ip *ip4;
|
||||
struct ip6_hdr *ip6;
|
||||
} l3;
|
||||
struct mbuf *m;
|
||||
struct mbuf *nm;
|
||||
struct tcphdr *th;
|
||||
struct tcp_ackent *ack_ent;
|
||||
uint32_t *ts_ptr;
|
||||
int32_t n_mbuf;
|
||||
bool other_opts, can_compress;
|
||||
uint8_t lro_type;
|
||||
uint16_t iptos;
|
||||
int tcp_hdr_offset;
|
||||
int idx;
|
||||
|
||||
/* Get current mbuf. */
|
||||
m = *pp;
|
||||
|
||||
/* Let the BPF see the packet */
|
||||
if (__predict_false(bpf_req))
|
||||
ETHER_BPF_MTAP(lc->ifp, m);
|
||||
|
||||
if (__predict_false(lagg_bpf_req))
|
||||
ETHER_BPF_MTAP(lagg_ifp, m);
|
||||
|
||||
tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
|
||||
lro_type = le->inner.data.lro_type;
|
||||
switch (lro_type) {
|
||||
case LRO_TYPE_NONE:
|
||||
lro_type = le->outer.data.lro_type;
|
||||
switch (lro_type) {
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip4);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IP;
|
||||
break;
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip6);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
|
||||
break;
|
||||
default:
|
||||
goto compressed;
|
||||
}
|
||||
break;
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip4);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IP;
|
||||
break;
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip6);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
|
||||
break;
|
||||
default:
|
||||
goto compressed;
|
||||
}
|
||||
|
||||
MPASS(tcp_hdr_offset >= 0);
|
||||
|
||||
m_adj(m, tcp_hdr_offset);
|
||||
m->m_flags |= M_LRO_EHDRSTRP;
|
||||
m->m_flags &= ~M_ACKCMP;
|
||||
m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
|
||||
|
||||
th = tcp_lro_get_th(m);
|
||||
|
||||
th->th_sum = 0; /* TCP checksum is valid. */
|
||||
|
||||
/* Check if ACK can be compressed */
|
||||
can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);
|
||||
|
||||
/* Now lets look at the should wake states */
|
||||
if ((other_opts == true) &&
|
||||
((tp->t_flags2 & TF2_DONT_SACK_QUEUE) == 0)) {
|
||||
/*
|
||||
* If there are other options (SACK?) and the
|
||||
* tcp endpoint has not expressly told us it does
|
||||
* not care about SACKS, then we should wake up.
|
||||
*/
|
||||
*should_wake = true;
|
||||
} else if (*should_wake == false) {
|
||||
/* Wakeup override check if we are false here */
|
||||
*should_wake = tcp_lro_check_wake_status(tp);
|
||||
}
|
||||
/* Is the ack compressable? */
|
||||
if (can_compress == false)
|
||||
goto done;
|
||||
/* Does the TCP endpoint support ACK compression? */
|
||||
if ((tp->t_flags2 & TF2_MBUF_ACKCMP) == 0)
|
||||
goto done;
|
||||
|
||||
/* Lets get the TOS/traffic class field */
|
||||
l3.ptr = mtod(m, void *);
|
||||
switch (lro_type) {
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
iptos = l3.ip4->ip_tos;
|
||||
break;
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
|
||||
break;
|
||||
default:
|
||||
iptos = 0; /* Keep compiler happy. */
|
||||
break;
|
||||
}
|
||||
/* Now lets get space if we don't have some already */
|
||||
if (*cmp == NULL) {
|
||||
new_one:
|
||||
nm = tcp_lro_get_last_if_ackcmp(lc, le, tp, &n_mbuf,
|
||||
can_append_old_cmp);
|
||||
if (__predict_false(nm == NULL))
|
||||
goto done;
|
||||
*cmp = nm;
|
||||
if (n_mbuf) {
|
||||
/*
|
||||
* Link in the new cmp ack to our in-order place,
|
||||
* first set our cmp ack's next to where we are.
|
||||
*/
|
||||
nm->m_nextpkt = m;
|
||||
(*pp) = nm;
|
||||
/*
|
||||
* Set it up so mv_to is advanced to our
|
||||
* compressed ack. This way the caller can
|
||||
* advance pp to the right place.
|
||||
*/
|
||||
*mv_to = nm;
|
||||
/*
|
||||
* Advance it here locally as well.
|
||||
*/
|
||||
pp = &nm->m_nextpkt;
|
||||
}
|
||||
} else {
|
||||
/* We have one already we are working on */
|
||||
nm = *cmp;
|
||||
if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
|
||||
/* We ran out of space */
|
||||
tp->t_flags2 |= TF2_MBUF_L_ACKS;
|
||||
goto new_one;
|
||||
}
|
||||
}
|
||||
MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
|
||||
counter_u64_add(tcp_inp_lro_compressed, 1);
|
||||
le->compressed++;
|
||||
/* We can add in to the one on the tail */
|
||||
ack_ent = mtod(nm, struct tcp_ackent *);
|
||||
idx = (nm->m_len / sizeof(struct tcp_ackent));
|
||||
build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);
|
||||
|
||||
/* Bump the size of both pkt-hdr and len */
|
||||
nm->m_len += sizeof(struct tcp_ackent);
|
||||
nm->m_pkthdr.len += sizeof(struct tcp_ackent);
|
||||
compressed:
|
||||
/* Advance to next mbuf before freeing. */
|
||||
*pp = m->m_nextpkt;
|
||||
m->m_nextpkt = NULL;
|
||||
m_freem(m);
|
||||
return (true);
|
||||
done:
|
||||
counter_u64_add(tcp_uncomp_total, 1);
|
||||
le->uncompressed++;
|
||||
return (false);
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct lro_head *
|
||||
tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser)
|
||||
{
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
|
||||
#include <sys/time.h>
|
||||
#include <sys/param.h>
|
||||
|
||||
#include <sys/mbuf.h>
|
||||
#include <netinet/in.h>
|
||||
|
||||
#ifndef TCP_LRO_ENTRIES
|
||||
|
@ -200,12 +200,25 @@ struct tcp_ackent {
|
|||
#define TCP_LRO_LENGTH_MAX (65535 - 255) /* safe value with room for outer headers */
|
||||
#define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */
|
||||
|
||||
#define TCP_LRO_TS_OPTION ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |\
|
||||
(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
|
||||
|
||||
static inline struct tcphdr *
|
||||
tcp_lro_get_th(struct mbuf *m)
|
||||
{
|
||||
return ((struct tcphdr *)((char *)m->m_data +
|
||||
m->m_pkthdr.lro_tcp_h_off));
|
||||
}
|
||||
|
||||
extern long tcplro_stacks_wanting_mbufq;
|
||||
|
||||
int tcp_lro_init(struct lro_ctrl *);
|
||||
int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned);
|
||||
void tcp_lro_free(struct lro_ctrl *);
|
||||
void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *);
|
||||
void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
|
||||
void tcp_lro_flush_all(struct lro_ctrl *);
|
||||
int tcp_lro_flush_tcphpts(struct lro_ctrl *, struct lro_entry *);
|
||||
int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
|
||||
void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *);
|
||||
void tcp_lro_reg_mbufq(void);
|
||||
|
|
577
sys/netinet/tcp_lro_hpts.c
Normal file
577
sys/netinet/tcp_lro_hpts.c
Normal file
|
@ -0,0 +1,577 @@
|
|||
/*-
|
||||
* Copyright (c) 2016-2018 Netflix, Inc.
|
||||
* Copyright (c) 2016-2021 Mellanox Technologies.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
#include <sys/cdefs.h>
|
||||
#include "opt_inet.h"
|
||||
#include "opt_inet6.h"
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/mbuf.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/socketvar.h>
|
||||
#include <sys/sysctl.h>
|
||||
|
||||
#include <net/if.h>
|
||||
#include <net/if_var.h>
|
||||
#include <net/ethernet.h>
|
||||
#include <net/bpf.h>
|
||||
#include <net/vnet.h>
|
||||
#include <net/if_dl.h>
|
||||
#include <net/if_media.h>
|
||||
#include <net/if_types.h>
|
||||
#include <net/infiniband.h>
|
||||
#include <net/if_lagg.h>
|
||||
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/ip6.h>
|
||||
#include <netinet/ip.h>
|
||||
#include <netinet/ip_var.h>
|
||||
#include <netinet/in_pcb.h>
|
||||
#include <netinet6/in6_pcb.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <netinet/tcp_lro.h>
|
||||
#include <netinet/tcp_var.h>
|
||||
#include <netinet/tcp_hpts.h>
|
||||
#include <netinet/tcp_log_buf.h>
|
||||
|
||||
static void
|
||||
build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
|
||||
uint32_t *ts_ptr, uint16_t iptos)
|
||||
{
|
||||
/*
|
||||
* Given a TCP ACK, summarize it down into the small TCP ACK
|
||||
* entry.
|
||||
*/
|
||||
ae->timestamp = m->m_pkthdr.rcv_tstmp;
|
||||
ae->flags = 0;
|
||||
if (m->m_flags & M_TSTMP_LRO)
|
||||
ae->flags |= TSTMP_LRO;
|
||||
else if (m->m_flags & M_TSTMP)
|
||||
ae->flags |= TSTMP_HDWR;
|
||||
ae->seq = ntohl(th->th_seq);
|
||||
ae->ack = ntohl(th->th_ack);
|
||||
ae->flags |= tcp_get_flags(th);
|
||||
if (ts_ptr != NULL) {
|
||||
ae->ts_value = ntohl(ts_ptr[1]);
|
||||
ae->ts_echo = ntohl(ts_ptr[2]);
|
||||
ae->flags |= HAS_TSTMP;
|
||||
}
|
||||
ae->win = ntohs(th->th_win);
|
||||
ae->codepoint = iptos;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
|
||||
{
|
||||
/*
|
||||
* This function returns two bits of valuable information.
|
||||
* a) Is what is present capable of being ack-compressed,
|
||||
* we can ack-compress if there is no options or just
|
||||
* a timestamp option, and of course the th_flags must
|
||||
* be correct as well.
|
||||
* b) Our other options present such as SACK. This is
|
||||
* used to determine if we want to wakeup or not.
|
||||
*/
|
||||
bool ret = true;
|
||||
|
||||
switch (th->th_off << 2) {
|
||||
case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
|
||||
*ppts = (uint32_t *)(th + 1);
|
||||
/* Check if we have only one timestamp option. */
|
||||
if (**ppts == TCP_LRO_TS_OPTION)
|
||||
*other_opts = false;
|
||||
else {
|
||||
*other_opts = true;
|
||||
ret = false;
|
||||
}
|
||||
break;
|
||||
case (sizeof(*th)):
|
||||
/* No options. */
|
||||
*ppts = NULL;
|
||||
*other_opts = false;
|
||||
break;
|
||||
default:
|
||||
*ppts = NULL;
|
||||
*other_opts = true;
|
||||
ret = false;
|
||||
break;
|
||||
}
|
||||
/* For ACKCMP we only accept ACK, PUSH, ECE and CWR. */
|
||||
if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
|
||||
ret = false;
|
||||
/* If it has data on it we cannot compress it */
|
||||
if (m->m_pkthdr.lro_tcp_d_len)
|
||||
ret = false;
|
||||
|
||||
/* ACK flag must be set. */
|
||||
if (!(tcp_get_flags(th) & TH_ACK))
|
||||
ret = false;
|
||||
return (ret);
|
||||
}
|
||||
|
||||
static bool
|
||||
tcp_lro_check_wake_status(struct tcpcb *tp)
|
||||
{
|
||||
|
||||
if (tp->t_fb->tfb_early_wake_check != NULL)
|
||||
return ((tp->t_fb->tfb_early_wake_check)(tp));
|
||||
return (false);
|
||||
}
|
||||
|
||||
static void
|
||||
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
|
||||
const struct lro_entry *le, const struct mbuf *m,
|
||||
int frm, int32_t tcp_data_len, uint32_t th_seq,
|
||||
uint32_t th_ack, uint16_t th_win)
|
||||
{
|
||||
if (tcp_bblogging_on(tp)) {
|
||||
union tcp_log_stackspecific log;
|
||||
struct timeval tv, btv;
|
||||
uint32_t cts;
|
||||
|
||||
cts = tcp_get_usecs(&tv);
|
||||
memset(&log, 0, sizeof(union tcp_log_stackspecific));
|
||||
log.u_bbr.flex8 = frm;
|
||||
log.u_bbr.flex1 = tcp_data_len;
|
||||
if (m)
|
||||
log.u_bbr.flex2 = m->m_pkthdr.len;
|
||||
else
|
||||
log.u_bbr.flex2 = 0;
|
||||
if (le->m_head) {
|
||||
log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
|
||||
log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
|
||||
log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
|
||||
log.u_bbr.delRate = le->m_head->m_flags;
|
||||
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
|
||||
}
|
||||
log.u_bbr.inflight = th_seq;
|
||||
log.u_bbr.delivered = th_ack;
|
||||
log.u_bbr.timeStamp = cts;
|
||||
log.u_bbr.epoch = le->next_seq;
|
||||
log.u_bbr.lt_epoch = le->ack_seq;
|
||||
log.u_bbr.pacing_gain = th_win;
|
||||
log.u_bbr.cwnd_gain = le->window;
|
||||
log.u_bbr.lost = curcpu;
|
||||
log.u_bbr.cur_del_rate = (uintptr_t)m;
|
||||
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
|
||||
bintime2timeval(&lc->lro_last_queue_time, &btv);
|
||||
log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
|
||||
log.u_bbr.flex7 = le->compressed;
|
||||
log.u_bbr.pacing_gain = le->uncompressed;
|
||||
if (in_epoch(net_epoch_preempt))
|
||||
log.u_bbr.inhpts = 1;
|
||||
else
|
||||
log.u_bbr.inhpts = 0;
|
||||
TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
|
||||
&tptosocket(tp)->so_snd,
|
||||
TCP_LOG_LRO, 0, 0, &log, false, &tv);
|
||||
}
|
||||
}
|
||||
|
||||
static struct mbuf *
|
||||
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
|
||||
struct tcpcb *tp, int32_t *new_m, bool can_append_old_cmp)
|
||||
{
|
||||
struct mbuf *m;
|
||||
|
||||
/* Look at the last mbuf if any in queue */
|
||||
if (can_append_old_cmp) {
|
||||
m = STAILQ_LAST(&tp->t_inqueue, mbuf, m_stailqpkt);
|
||||
if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
|
||||
if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
|
||||
tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
|
||||
*new_m = 0;
|
||||
counter_u64_add(tcp_extra_mbuf, 1);
|
||||
return (m);
|
||||
} else {
|
||||
/* Mark we ran out of space */
|
||||
tp->t_flags2 |= TF2_MBUF_L_ACKS;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Decide mbuf size. */
|
||||
tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
|
||||
if (tp->t_flags2 & TF2_MBUF_L_ACKS)
|
||||
m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
|
||||
else
|
||||
m = m_gethdr(M_NOWAIT, MT_DATA);
|
||||
|
||||
if (__predict_false(m == NULL)) {
|
||||
counter_u64_add(tcp_would_have_but, 1);
|
||||
return (NULL);
|
||||
}
|
||||
counter_u64_add(tcp_comp_total, 1);
|
||||
m->m_pkthdr.rcvif = lc->ifp;
|
||||
m->m_flags |= M_ACKCMP;
|
||||
*new_m = 1;
|
||||
return (m);
|
||||
}
|
||||
|
||||
/*
|
||||
* Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
|
||||
* and strip all, but the IPv4/IPv6 header.
|
||||
*/
|
||||
static bool
|
||||
do_bpf_strip_and_compress(struct tcpcb *tp, struct lro_ctrl *lc,
|
||||
struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp,
|
||||
struct mbuf **mv_to, bool *should_wake, bool bpf_req, bool lagg_bpf_req,
|
||||
struct ifnet *lagg_ifp, bool can_append_old_cmp)
|
||||
{
|
||||
union {
|
||||
void *ptr;
|
||||
struct ip *ip4;
|
||||
struct ip6_hdr *ip6;
|
||||
} l3;
|
||||
struct mbuf *m;
|
||||
struct mbuf *nm;
|
||||
struct tcphdr *th;
|
||||
struct tcp_ackent *ack_ent;
|
||||
uint32_t *ts_ptr;
|
||||
int32_t n_mbuf;
|
||||
bool other_opts, can_compress;
|
||||
uint8_t lro_type;
|
||||
uint16_t iptos;
|
||||
int tcp_hdr_offset;
|
||||
int idx;
|
||||
|
||||
/* Get current mbuf. */
|
||||
m = *pp;
|
||||
|
||||
/* Let the BPF see the packet */
|
||||
if (__predict_false(bpf_req))
|
||||
ETHER_BPF_MTAP(lc->ifp, m);
|
||||
|
||||
if (__predict_false(lagg_bpf_req))
|
||||
ETHER_BPF_MTAP(lagg_ifp, m);
|
||||
|
||||
tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
|
||||
lro_type = le->inner.data.lro_type;
|
||||
switch (lro_type) {
|
||||
case LRO_TYPE_NONE:
|
||||
lro_type = le->outer.data.lro_type;
|
||||
switch (lro_type) {
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip4);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IP;
|
||||
break;
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip6);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
|
||||
break;
|
||||
default:
|
||||
goto compressed;
|
||||
}
|
||||
break;
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip4);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IP;
|
||||
break;
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
tcp_hdr_offset -= sizeof(*le->outer.ip6);
|
||||
m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
|
||||
break;
|
||||
default:
|
||||
goto compressed;
|
||||
}
|
||||
|
||||
MPASS(tcp_hdr_offset >= 0);
|
||||
|
||||
m_adj(m, tcp_hdr_offset);
|
||||
m->m_flags |= M_LRO_EHDRSTRP;
|
||||
m->m_flags &= ~M_ACKCMP;
|
||||
m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
|
||||
|
||||
th = tcp_lro_get_th(m);
|
||||
|
||||
th->th_sum = 0; /* TCP checksum is valid. */
|
||||
|
||||
/* Check if ACK can be compressed */
|
||||
can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);
|
||||
|
||||
/* Now lets look at the should wake states */
|
||||
if ((other_opts == true) &&
|
||||
((tp->t_flags2 & TF2_DONT_SACK_QUEUE) == 0)) {
|
||||
/*
|
||||
* If there are other options (SACK?) and the
|
||||
* tcp endpoint has not expressly told us it does
|
||||
* not care about SACKS, then we should wake up.
|
||||
*/
|
||||
*should_wake = true;
|
||||
} else if (*should_wake == false) {
|
||||
/* Wakeup override check if we are false here */
|
||||
*should_wake = tcp_lro_check_wake_status(tp);
|
||||
}
|
||||
/* Is the ack compressable? */
|
||||
if (can_compress == false)
|
||||
goto done;
|
||||
/* Does the TCP endpoint support ACK compression? */
|
||||
if ((tp->t_flags2 & TF2_MBUF_ACKCMP) == 0)
|
||||
goto done;
|
||||
|
||||
/* Lets get the TOS/traffic class field */
|
||||
l3.ptr = mtod(m, void *);
|
||||
switch (lro_type) {
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
iptos = l3.ip4->ip_tos;
|
||||
break;
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
|
||||
break;
|
||||
default:
|
||||
iptos = 0; /* Keep compiler happy. */
|
||||
break;
|
||||
}
|
||||
/* Now lets get space if we don't have some already */
|
||||
if (*cmp == NULL) {
|
||||
new_one:
|
||||
nm = tcp_lro_get_last_if_ackcmp(lc, le, tp, &n_mbuf,
|
||||
can_append_old_cmp);
|
||||
if (__predict_false(nm == NULL))
|
||||
goto done;
|
||||
*cmp = nm;
|
||||
if (n_mbuf) {
|
||||
/*
|
||||
* Link in the new cmp ack to our in-order place,
|
||||
* first set our cmp ack's next to where we are.
|
||||
*/
|
||||
nm->m_nextpkt = m;
|
||||
(*pp) = nm;
|
||||
/*
|
||||
* Set it up so mv_to is advanced to our
|
||||
* compressed ack. This way the caller can
|
||||
* advance pp to the right place.
|
||||
*/
|
||||
*mv_to = nm;
|
||||
/*
|
||||
* Advance it here locally as well.
|
||||
*/
|
||||
pp = &nm->m_nextpkt;
|
||||
}
|
||||
} else {
|
||||
/* We have one already we are working on */
|
||||
nm = *cmp;
|
||||
if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
|
||||
/* We ran out of space */
|
||||
tp->t_flags2 |= TF2_MBUF_L_ACKS;
|
||||
goto new_one;
|
||||
}
|
||||
}
|
||||
MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
|
||||
counter_u64_add(tcp_inp_lro_compressed, 1);
|
||||
le->compressed++;
|
||||
/* We can add in to the one on the tail */
|
||||
ack_ent = mtod(nm, struct tcp_ackent *);
|
||||
idx = (nm->m_len / sizeof(struct tcp_ackent));
|
||||
build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);
|
||||
|
||||
/* Bump the size of both pkt-hdr and len */
|
||||
nm->m_len += sizeof(struct tcp_ackent);
|
||||
nm->m_pkthdr.len += sizeof(struct tcp_ackent);
|
||||
compressed:
|
||||
/* Advance to next mbuf before freeing. */
|
||||
*pp = m->m_nextpkt;
|
||||
m->m_nextpkt = NULL;
|
||||
m_freem(m);
|
||||
return (true);
|
||||
done:
|
||||
counter_u64_add(tcp_uncomp_total, 1);
|
||||
le->uncompressed++;
|
||||
return (false);
|
||||
}
|
||||
|
||||
static void
|
||||
tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le)
|
||||
{
|
||||
|
||||
INP_WLOCK_ASSERT(tptoinpcb(tp));
|
||||
|
||||
STAILQ_HEAD(, mbuf) q = { le->m_head,
|
||||
&STAILQ_NEXT(le->m_last_mbuf, m_stailqpkt) };
|
||||
STAILQ_CONCAT(&tp->t_inqueue, &q);
|
||||
le->m_head = NULL;
|
||||
le->m_last_mbuf = NULL;
|
||||
}
|
||||
|
||||
static struct tcpcb *
|
||||
tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
|
||||
{
|
||||
struct inpcb *inp;
|
||||
|
||||
switch (pa->data.lro_type) {
|
||||
#ifdef INET6
|
||||
case LRO_TYPE_IPV6_TCP:
|
||||
inp = in6_pcblookup(&V_tcbinfo,
|
||||
&pa->data.s_addr.v6,
|
||||
pa->data.s_port,
|
||||
&pa->data.d_addr.v6,
|
||||
pa->data.d_port,
|
||||
INPLOOKUP_WLOCKPCB,
|
||||
ifp);
|
||||
break;
|
||||
#endif
|
||||
#ifdef INET
|
||||
case LRO_TYPE_IPV4_TCP:
|
||||
inp = in_pcblookup(&V_tcbinfo,
|
||||
pa->data.s_addr.v4,
|
||||
pa->data.s_port,
|
||||
pa->data.d_addr.v4,
|
||||
pa->data.d_port,
|
||||
INPLOOKUP_WLOCKPCB,
|
||||
ifp);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
return (intotcpcb(inp));
|
||||
}
|
||||
|
||||
int
|
||||
tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
|
||||
{
|
||||
struct tcpcb *tp;
|
||||
struct mbuf **pp, *cmp, *mv_to;
|
||||
struct ifnet *lagg_ifp;
|
||||
bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;
|
||||
|
||||
/* Check if packet doesn't belongs to our network interface. */
|
||||
if ((tcplro_stacks_wanting_mbufq == 0) ||
|
||||
(le->outer.data.vlan_id != 0) ||
|
||||
(le->inner.data.lro_type != LRO_TYPE_NONE))
|
||||
return (TCP_LRO_CANNOT);
|
||||
|
||||
#ifdef INET6
|
||||
/*
|
||||
* Be proactive about unspecified IPv6 address in source. As
|
||||
* we use all-zero to indicate unbounded/unconnected pcb,
|
||||
* unspecified IPv6 address can be used to confuse us.
|
||||
*
|
||||
* Note that packets with unspecified IPv6 destination is
|
||||
* already dropped in ip6_input.
|
||||
*/
|
||||
if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
|
||||
IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
|
||||
return (TCP_LRO_CANNOT);
|
||||
|
||||
if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
|
||||
IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
|
||||
return (TCP_LRO_CANNOT);
|
||||
#endif
|
||||
/* Lookup inp, if any. Returns locked TCP inpcb. */
|
||||
tp = tcp_lro_lookup(lc->ifp,
|
||||
(le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
|
||||
if (tp == NULL)
|
||||
return (TCP_LRO_CANNOT);
|
||||
|
||||
counter_u64_add(tcp_inp_lro_locks_taken, 1);
|
||||
|
||||
/* Check if the inp is dead, Jim. */
|
||||
if (tp->t_state == TCPS_TIME_WAIT) {
|
||||
INP_WUNLOCK(tptoinpcb(tp));
|
||||
return (TCP_LRO_CANNOT);
|
||||
}
|
||||
if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
|
||||
tp->t_lro_cpu = lc->lro_last_cpu;
|
||||
/* Check if the transport doesn't support the needed optimizations. */
|
||||
if ((tp->t_flags2 & (TF2_SUPPORTS_MBUFQ | TF2_MBUF_ACKCMP)) == 0) {
|
||||
INP_WUNLOCK(tptoinpcb(tp));
|
||||
return (TCP_LRO_CANNOT);
|
||||
}
|
||||
|
||||
if (tp->t_flags2 & TF2_MBUF_QUEUE_READY)
|
||||
should_wake = false;
|
||||
else
|
||||
should_wake = true;
|
||||
/* Check if packets should be tapped to BPF. */
|
||||
bpf_req = bpf_peers_present(lc->ifp->if_bpf);
|
||||
lagg_bpf_req = false;
|
||||
lagg_ifp = NULL;
|
||||
if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
|
||||
lc->ifp->if_type == IFT_INFINIBANDLAG) {
|
||||
struct lagg_port *lp = lc->ifp->if_lagg;
|
||||
struct lagg_softc *sc = lp->lp_softc;
|
||||
|
||||
lagg_ifp = sc->sc_ifp;
|
||||
if (lagg_ifp != NULL)
|
||||
lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
|
||||
}
|
||||
|
||||
/* Strip and compress all the incoming packets. */
|
||||
can_append_old_cmp = true;
|
||||
cmp = NULL;
|
||||
for (pp = &le->m_head; *pp != NULL; ) {
|
||||
mv_to = NULL;
|
||||
if (do_bpf_strip_and_compress(tp, lc, le, pp, &cmp, &mv_to,
|
||||
&should_wake, bpf_req, lagg_bpf_req, lagg_ifp,
|
||||
can_append_old_cmp) == false) {
|
||||
/* Advance to next mbuf. */
|
||||
pp = &(*pp)->m_nextpkt;
|
||||
/*
|
||||
* Once we have appended we can't look in the pending
|
||||
* inbound packets for a compressed ack to append to.
|
||||
*/
|
||||
can_append_old_cmp = false;
|
||||
/*
|
||||
* Once we append we also need to stop adding to any
|
||||
* compressed ack we were remembering. A new cmp
|
||||
* ack will be required.
|
||||
*/
|
||||
cmp = NULL;
|
||||
tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
|
||||
} else if (mv_to != NULL) {
|
||||
/* We are asked to move pp up */
|
||||
pp = &mv_to->m_nextpkt;
|
||||
tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
|
||||
} else
|
||||
tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
|
||||
}
|
||||
/* Update "m_last_mbuf", if any. */
|
||||
if (pp == &le->m_head)
|
||||
le->m_last_mbuf = *pp;
|
||||
else
|
||||
le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);
|
||||
|
||||
/* Check if any data mbufs left. */
|
||||
if (le->m_head != NULL) {
|
||||
counter_u64_add(tcp_inp_lro_direct_queue, 1);
|
||||
tcp_lro_log(tp, lc, le, NULL, 22, 1, tp->t_flags2, 0, 1);
|
||||
tcp_queue_pkts(tp, le);
|
||||
}
|
||||
if (should_wake) {
|
||||
/* Wakeup */
|
||||
counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
|
||||
if ((*tp->t_fb->tfb_do_queued_segments)(tp, 0))
|
||||
/* TCP cb gone and unlocked. */
|
||||
return (0);
|
||||
}
|
||||
INP_WUNLOCK(tptoinpcb(tp));
|
||||
|
||||
return (0); /* Success. */
|
||||
}
|
Loading…
Reference in a new issue