cxgbe: Support TCP_USE_DDP on offloaded TOE connections

When this socket option is enabled, relatively large contiguous
buffers are allocated and used to receive data from the remote
connection.  When data is received, a wrapper M_EXT mbuf is queued to
the socket's receive buffer.  This reduces the length of the linked
list of received mbufs and allows consumers to process received data
in larger chunks.
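
For illustration only (not part of this change), an in-kernel consumer
could request the option roughly as follows.  The option is settable
only by the kernel (a request carrying a thread pointer fails with
EPERM), so sopt_td is left NULL; the socket pointer "so" is assumed to
refer to an already-offloaded TOE connection:

	struct sockopt sopt;
	int error, one = 1;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_USE_DDP;
	sopt.sopt_val = &one;
	sopt.sopt_valsize = sizeof(one);
	sopt.sopt_td = NULL;	/* kernel request; user requests get EPERM */
	error = sosetopt(so, &sopt);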

To minimize reprogramming of page pods in the adapter, receive
buffers for a given connection are recycled.  When a buffer has been
fully consumed by the receiver and freed, it is placed on a
per-connection list of free buffers.
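
A rough sketch of the recycling path (the helper names here are
illustrative; only the fields, macros, and tunable visible in the diff
below are taken from the actual code):

	static void
	recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
	{
		DDP_CACHE_LOCK(toep);
		if ((toep->ddp.flags & DDP_DEAD) == 0 &&
		    toep->ddp.cached_count < t4_ddp_rcvbuf_cache) {
			/* Keep the buffer and its programmed page pods for reuse. */
			TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link);
			toep->ddp.cached_count++;
			DDP_CACHE_UNLOCK(toep);
		} else {
			DDP_CACHE_UNLOCK(toep);
			/* Illustrative helper: release the page pods and free the buffer. */
			free_ddp_rcv_buffer(toep, drb);
		}
	}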

The size of the receive buffers defaults to 256k and can be set via
the hw.cxgbe.toe.ddp_rcvbuf_len sysctl.  The
hw.cxgbe.toe.ddp_rcvbuf_cache sysctl (defaults to 4) determines the
maximum number of free buffers cached per connection.  Note that this
limit does not apply to "in-flight" receive buffers that are
associated with mbufs in the socket's receive buffer.
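
Both knobs are CTLFLAG_RWTUN, so they can be set as loader tunables or
changed at runtime with sysctl(8); the values below are examples only:

	# /boot/loader.conf
	# Use 512k receive buffers and cache up to 8 free buffers per connection.
	hw.cxgbe.toe.ddp_rcvbuf_len="524288"
	hw.cxgbe.toe.ddp_rcvbuf_cache="8"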

Co-authored-by:	Navdeep Parhar <np@FreeBSD.org>
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44001
John Baldwin 2024-03-20 15:29:28 -07:00
parent 3d0a736796
commit eba13bbc37
7 changed files with 854 additions and 60 deletions


@@ -690,6 +690,10 @@ struct sge_ofld_rxq {
uint64_t rx_aio_ddp_octets;
u_long rx_toe_tls_records;
u_long rx_toe_tls_octets;
u_long rx_toe_ddp_octets;
counter_u64_t ddp_buffer_alloc;
counter_u64_t ddp_buffer_reuse;
counter_u64_t ddp_buffer_free;
} __aligned(CACHE_LINE_SIZE);
static inline struct sge_ofld_rxq *
@@ -1344,6 +1348,8 @@ extern int t4_tmr_idx;
extern int t4_pktc_idx;
extern unsigned int t4_qsize_rxq;
extern unsigned int t4_qsize_txq;
extern int t4_ddp_rcvbuf_len;
extern unsigned int t4_ddp_rcvbuf_cache;
extern device_method_t cxgbe_methods[];
int t4_os_find_pci_capability(struct adapter *, int);


@@ -412,6 +412,15 @@ SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN,
&t4_toe_rexmt_backoff[14], 0, "");
SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN,
&t4_toe_rexmt_backoff[15], 0, "");
int t4_ddp_rcvbuf_len = 256 * 1024;
SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, ddp_rcvbuf_len, CTLFLAG_RWTUN,
&t4_ddp_rcvbuf_len, 0, "length of each DDP RX buffer");
unsigned int t4_ddp_rcvbuf_cache = 4;
SYSCTL_UINT(_hw_cxgbe_toe, OID_AUTO, ddp_rcvbuf_cache, CTLFLAG_RWTUN,
&t4_ddp_rcvbuf_cache, 0,
"maximum number of free DDP RX buffers to cache per connection");
#endif
#ifdef DEV_NETMAP
@@ -12046,6 +12055,10 @@ clear_stats(struct adapter *sc, u_int port_id)
ofld_rxq->rx_aio_ddp_octets = 0;
ofld_rxq->rx_toe_tls_records = 0;
ofld_rxq->rx_toe_tls_octets = 0;
ofld_rxq->rx_toe_ddp_octets = 0;
counter_u64_zero(ofld_rxq->ddp_buffer_alloc);
counter_u64_zero(ofld_rxq->ddp_buffer_reuse);
counter_u64_zero(ofld_rxq->ddp_buffer_free);
}
#endif


@@ -4098,6 +4098,9 @@ alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int idx,
ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK);
ofld_rxq->rx_iscsi_ddp_setup_error =
counter_u64_alloc(M_WAITOK);
ofld_rxq->ddp_buffer_alloc = counter_u64_alloc(M_WAITOK);
ofld_rxq->ddp_buffer_reuse = counter_u64_alloc(M_WAITOK);
ofld_rxq->ddp_buffer_free = counter_u64_alloc(M_WAITOK);
add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq);
}
@@ -4132,6 +4135,9 @@ free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED));
counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok);
counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error);
counter_u64_free(ofld_rxq->ddp_buffer_alloc);
counter_u64_free(ofld_rxq->ddp_buffer_reuse);
counter_u64_free(ofld_rxq->ddp_buffer_free);
bzero(ofld_rxq, sizeof(*ofld_rxq));
}
}
@@ -4158,6 +4164,18 @@ add_ofld_rxq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
"rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets,
"# of payload octets in received TOE TLS records");
SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
"rx_toe_ddp_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_ddp_octets,
"# of payload octets received via TCP DDP");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
"ddp_buffer_alloc", CTLFLAG_RD, &ofld_rxq->ddp_buffer_alloc,
"# of DDP RCV buffers allocated");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
"ddp_buffer_reuse", CTLFLAG_RD, &ofld_rxq->ddp_buffer_reuse,
"# of DDP RCV buffers reused");
SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
"ddp_buffer_free", CTLFLAG_RD, &ofld_rxq->ddp_buffer_free,
"# of DDP RCV buffers freed");
oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics");


@@ -1352,8 +1352,6 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
if (toep->flags & TPF_ABORT_SHUTDOWN)
goto done;
- so = inp->inp_socket;
- socantrcvmore(so);
if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
DDP_LOCK(toep);
if (__predict_false(toep->ddp.flags &
@@ -1361,6 +1359,8 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
handle_ddp_close(toep, tp, cpl->rcv_nxt);
DDP_UNLOCK(toep);
}
+ so = inp->inp_socket;
+ socantrcvmore(so);
if (ulp_mode(toep) == ULP_MODE_RDMA ||
(ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
@@ -1782,7 +1782,8 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
sbappendstream_locked(sb, m, 0);
t4_rcvd_locked(&toep->td->tod, tp);
- if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
+ if (ulp_mode(toep) == ULP_MODE_TCPDDP &&
+ (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 &&
sbavail(sb) != 0) {
CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
tid);

File diff suppressed because it is too large


@@ -1950,6 +1950,35 @@ t4_tom_deactivate(struct adapter *sc)
return (rc);
}
static int
t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
{
struct tcpcb *tp = sototcpcb(so);
struct toepcb *toep = tp->t_toe;
int error, optval;
if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) {
if (sopt->sopt_dir != SOPT_SET)
return (EOPNOTSUPP);
if (sopt->sopt_td != NULL) {
/* Only settable by the kernel. */
return (EPERM);
}
error = sooptcopyin(sopt, &optval, sizeof(optval),
sizeof(optval));
if (error != 0)
return (error);
if (optval != 0)
return (t4_enable_ddp_rcv(so, toep));
else
return (EOPNOTSUPP);
}
return (tcp_ctloutput(so, sopt));
}
static int
t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
{
@@ -1989,9 +2018,11 @@ t4_tom_mod_load(void)
t4_tls_mod_load();
bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
toe_protosw.pr_ctloutput = t4_ctloutput_tom;
toe_protosw.pr_aio_queue = t4_aio_queue_tom;
bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
toe6_protosw.pr_aio_queue = t4_aio_queue_tom;
return (t4_register_uld(&tom_uld_info));


@@ -85,6 +85,8 @@ enum {
DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */
DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */
DDP_DEAD = (1 << 6), /* toepcb is shutting down */
DDP_AIO = (1 << 7), /* DDP used for AIO, not so_rcv */
DDP_RCVBUF = (1 << 8), /* DDP used for so_rcv, not AIO */
};
struct bio;
@@ -156,25 +158,51 @@ TAILQ_HEAD(pagesetq, pageset);
#define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */
- struct ddp_buffer {
- struct pageset *ps;
- struct kaiocb *job;
- int cancel_pending;
- };
+ struct ddp_rcv_buffer {
+ TAILQ_ENTRY(ddp_rcv_buffer) link;
+ void *buf;
+ struct ppod_reservation prsv;
+ size_t len;
+ u_int refs;
+ };
+ struct ddp_buffer {
+ union {
+ /* DDP_AIO fields */
+ struct {
+ struct pageset *ps;
+ struct kaiocb *job;
+ int cancel_pending;
+ };
+ /* DDP_RCVBUF fields */
+ struct {
+ struct ddp_rcv_buffer *drb;
+ uint32_t placed;
+ };
+ };
+ };
+ /*
+  * (a) - DDP_AIO only
+  * (r) - DDP_RCVBUF only
+  */
struct ddp_pcb {
+ struct mtx lock;
u_int flags;
+ int active_id; /* the currently active DDP buffer */
struct ddp_buffer db[2];
- TAILQ_HEAD(, pageset) cached_pagesets;
- TAILQ_HEAD(, kaiocb) aiojobq;
- u_int waiting_count;
+ union {
+ TAILQ_HEAD(, pageset) cached_pagesets; /* (a) */
+ TAILQ_HEAD(, ddp_rcv_buffer) cached_buffers; /* (r) */
+ };
+ TAILQ_HEAD(, kaiocb) aiojobq; /* (a) */
+ u_int waiting_count; /* (a) */
u_int active_count;
u_int cached_count;
- int active_id; /* the currently active DDP buffer */
struct task requeue_task;
- struct kaiocb *queueing;
- struct mtx lock;
+ struct kaiocb *queueing; /* (a) */
+ struct mtx cache_lock; /* (r) */
};
struct toepcb {
@@ -230,6 +258,8 @@ ulp_mode(struct toepcb *toep)
#define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock)
#define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock)
#define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED)
#define DDP_CACHE_LOCK(toep) mtx_lock(&(toep)->ddp.cache_lock)
#define DDP_CACHE_UNLOCK(toep) mtx_unlock(&(toep)->ddp.cache_lock)
/*
* Compressed state for embryonic connections for a listener.
@@ -502,6 +532,7 @@ int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *,
struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *);
void t4_free_page_pods(struct ppod_reservation *);
int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
int t4_enable_ddp_rcv(struct socket *, struct toepcb *);
void t4_ddp_mod_load(void);
void t4_ddp_mod_unload(void);
void ddp_assert_empty(struct toepcb *);