cxgbe(4): Add support for Connection Offload Policy (aka COP).

COP allows fine-grained control on whether to offload a TCP connection
using t4_tom, and what settings to apply to a connection selected for
offload.  t4_tom must still be loaded and IFCAP_TOE must still be
enabled for full TCP offload to take place on an interface.  The
difference is that IFCAP_TOE used to be the only knob and would enable
TOE for all new connections on the interface, but now the driver will
also consult the COP, if any, before offloading to the hardware TOE.

A policy is a plain text file with any number of rules, one per line.
Each rule has a "match" part consisting of a socket-type (L = listen,
A = active open, P = passive open, D = don't care) and a pcap-filter(7)
expression, and a "settings" part that specifies whether to offload the
connection or not and the parameters to use if so.  The general format
of a rule is: [socket-type] expr => settings

Example.  See cxgbetool(8) for more information.
[L] ip && port http => offload
[L] port 443 => !offload
[L] port ssh => offload
[P] src net 192.168/16 && dst port ssh => offload !nagle !timestamp cong newreno
[P] dst port ssh => offload !nagle ecn cong tahoe
[P] dst port http => offload
[A] dst port 443 => offload tls
[A] dst net 192.168/16 => offload !timestamp cong highspeed

The driver processes the rules for each new listen, active open, or
passive open and stops at the first match.  There is an implicit rule at
the end of every policy that prohibits offload when no rule in the
policy matches:
[D] all => !offload

This is a reworked and expanded version of a patch submitted by
Krishnamraju Eraparaju @ Chelsio.

Sponsored by:	Chelsio Communications
This commit is contained in:
Navdeep Parhar 2018-04-14 19:07:56 +00:00
parent 23084818ff
commit 1131c927c4
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=332506
13 changed files with 1200 additions and 104 deletions

View file

@ -804,8 +804,11 @@ struct adapter {
void *tom_softc; /* (struct tom_data *) */
struct tom_tunables tt;
struct iw_tunables iwt;
struct t4_offload_policy *policy;
struct rwlock policy_lock;
void *iwarp_softc; /* (struct c4iw_dev *) */
struct iw_tunables iwt;
void *iscsi_ulp_softc; /* (struct cxgbei_data *) */
void *ccr_softc; /* (struct ccr_softc *) */
struct l2t_data *l2t; /* L2 table */

View file

@ -156,6 +156,7 @@ struct tom_tunables {
int num_tls_rx_ports;
int tx_align;
int tx_zcopy;
int cop_managed_offloading;
};
/* iWARP driver tunables */
struct iw_tunables {

View file

@ -35,6 +35,7 @@
#include <sys/types.h>
#include <net/ethernet.h>
#include <net/bpf.h>
/*
* Ioctl commands specific to this driver.
@ -344,6 +345,44 @@ struct t4_cudbg_dump {
uint8_t *data;
};
enum {
OPEN_TYPE_LISTEN = 'L',
OPEN_TYPE_ACTIVE = 'A',
OPEN_TYPE_PASSIVE = 'P',
OPEN_TYPE_DONTCARE = 'D',
};
struct offload_settings {
int8_t offload;
int8_t rx_coalesce;
int8_t cong_algo;
int8_t sched_class;
int8_t tstamp;
int8_t sack;
int8_t nagle;
int8_t ecn;
int8_t ddp;
int8_t tls;
int16_t txq;
int16_t rxq;
int16_t mss;
};
struct offload_rule {
char open_type;
struct offload_settings settings;
struct bpf_program bpf_prog; /* compiled program/filter */
};
/*
* An offload policy consists of a set of rules matched in sequence. The
* settings of the first rule that matches are applied to that connection.
*/
struct t4_offload_policy {
uint32_t nrules;
struct offload_rule *rule;
};
#define CHELSIO_T4_GETREG _IOWR('f', T4_GETREG, struct t4_reg)
#define CHELSIO_T4_SETREG _IOW('f', T4_SETREG, struct t4_reg)
#define CHELSIO_T4_REGDUMP _IOWR('f', T4_REGDUMP, struct t4_regdump)
@ -368,4 +407,5 @@ struct t4_cudbg_dump {
#define CHELSIO_T4_LOAD_BOOT _IOW('f', T4_LOAD_BOOT, struct t4_bootrom)
#define CHELSIO_T4_LOAD_BOOTCFG _IOW('f', T4_LOAD_BOOTCFG, struct t4_data)
#define CHELSIO_T4_CUDBG_DUMP _IOWR('f', T4_CUDBG_DUMP, struct t4_cudbg_dump)
#define CHELSIO_T4_SET_OFLD_POLICY _IOW('f', T4_SET_OFLD_POLICY, struct t4_offload_policy)
#endif

View file

@ -470,6 +470,14 @@ static int pcie_relaxed_ordering = -1;
TUNABLE_INT("hw.cxgbe.pcie_relaxed_ordering", &pcie_relaxed_ordering);
#ifdef TCP_OFFLOAD
/*
* TOE tunables.
*/
static int t4_cop_managed_offloading = 0;
TUNABLE_INT("hw.cxgbe.cop_managed_offloading", &t4_cop_managed_offloading);
#endif
/* Functions used by VIs to obtain unique MAC addresses for each VI. */
static int vi_mac_funcs[] = {
FW_VI_FUNC_ETH,
@ -617,6 +625,8 @@ static int load_cfg(struct adapter *, struct t4_data *);
static int load_boot(struct adapter *, struct t4_bootrom *);
static int load_bootcfg(struct adapter *, struct t4_data *);
static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *);
static void free_offload_policy(struct t4_offload_policy *);
static int set_offload_policy(struct adapter *, struct t4_offload_policy *);
static int read_card_mem(struct adapter *, int, struct t4_mem_range *);
static int read_i2c(struct adapter *, struct t4_i2c_data *);
#ifdef TCP_OFFLOAD
@ -897,6 +907,9 @@ t4_attach(device_t dev)
mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF);
sc->policy = NULL;
rw_init(&sc->policy_lock, "connection offload policy");
rc = t4_map_bars_0_and_4(sc);
if (rc != 0)
goto done; /* error message displayed already */
@ -1405,6 +1418,14 @@ t4_detach_common(device_t dev)
if (mtx_initialized(&sc->reg_lock))
mtx_destroy(&sc->reg_lock);
if (rw_initialized(&sc->policy_lock)) {
rw_destroy(&sc->policy_lock);
#ifdef TCP_OFFLOAD
if (sc->policy != NULL)
free_offload_policy(sc->policy);
#endif
}
for (i = 0; i < NUM_MEMWIN; i++) {
struct memwin *mw = &sc->memwin[i];
@ -5440,6 +5461,12 @@ t4_sysctls(struct adapter *sc)
CTLFLAG_RW, &sc->tt.tx_zcopy, 0,
"Enable zero-copy aio_write(2)");
sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading;
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"cop_managed_offloading", CTLFLAG_RW,
&sc->tt.cop_managed_offloading, 0,
"COP (Connection Offload Policy) controls all TOE offload");
SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A",
"TP timer tick (us)");
@ -9385,6 +9412,113 @@ cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump)
return (rc);
}
/*
 * Release an offload policy: each rule's compiled BPF filter, the rule
 * array, and finally the policy structure itself.  A NULL policy is a
 * no-op so callers can pass sc->policy unconditionally.
 */
static void
free_offload_policy(struct t4_offload_policy *op)
{
	int i;

	if (op == NULL)
		return;

	for (i = 0; i < op->nrules; i++)
		free(op->rule[i].bpf_prog.bf_insns, M_CXGBE);
	free(op->rule, M_CXGBE);
	free(op, M_CXGBE);
}
static int
set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop)
{
int i, rc, len;
struct t4_offload_policy *op, *old;
struct bpf_program *bf;
const struct offload_settings *s;
struct offload_rule *r;
void *u;
if (!is_offload(sc))
return (ENODEV);
if (uop->nrules == 0) {
/* Delete installed policies. */
op = NULL;
goto set_policy;
} if (uop->nrules > 256) { /* arbitrary */
return (E2BIG);
}
/* Copy userspace offload policy to kernel */
op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK);
op->nrules = uop->nrules;
len = op->nrules * sizeof(struct offload_rule);
op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
rc = copyin(uop->rule, op->rule, len);
if (rc) {
free(op->rule, M_CXGBE);
free(op, M_CXGBE);
return (rc);
}
r = &op->rule[0];
for (i = 0; i < op->nrules; i++, r++) {
/* Validate open_type */
if (r->open_type != OPEN_TYPE_LISTEN &&
r->open_type != OPEN_TYPE_ACTIVE &&
r->open_type != OPEN_TYPE_PASSIVE &&
r->open_type != OPEN_TYPE_DONTCARE) {
error:
/*
* Rules 0 to i have malloc'd filters that need to be
* freed. Rules i+1 to nrules have userspace pointers
* and should be left alone.
*/
op->nrules = i;
free_offload_policy(op);
return (rc);
}
/* Validate settings */
s = &r->settings;
if ((s->offload != 0 && s->offload != 1) ||
s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED ||
s->sched_class < -1 ||
s->sched_class >= sc->chip_params->nsched_cls) {
rc = EINVAL;
goto error;
}
bf = &r->bpf_prog;
u = bf->bf_insns; /* userspace ptr */
bf->bf_insns = NULL;
if (bf->bf_len == 0) {
/* legal, matches everything */
continue;
}
len = bf->bf_len * sizeof(*bf->bf_insns);
bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK);
rc = copyin(u, bf->bf_insns, len);
if (rc != 0)
goto error;
if (!bpf_validate(bf->bf_insns, bf->bf_len)) {
rc = EINVAL;
goto error;
}
}
set_policy:
rw_wlock(&sc->policy_lock);
old = sc->policy;
sc->policy = op;
rw_wunlock(&sc->policy_lock);
free_offload_policy(old);
return (0);
}
#define MAX_READ_BUF_SIZE (128 * 1024)
static int
read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr)
@ -9743,6 +9877,9 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag,
case CHELSIO_T4_CUDBG_DUMP:
rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data);
break;
case CHELSIO_T4_SET_OFLD_POLICY:
rc = set_offload_policy(sc, (struct t4_offload_policy *)data);
break;
default:
rc = ENOTTY;
}

View file

@ -963,8 +963,10 @@ mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
#ifdef TCP_OFFLOAD
if (toe) {
payload = sc->tt.rx_coalesce ?
G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2));
/* Note that COP can set rx_coalesce on/off per connection. */
payload = max(mtu, rxcs);
} else {
#endif
/* large enough even when hw VLAN extraction is disabled */

View file

@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
@ -55,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>
#include "common/common.h"
#include "common/t4_msg.h"
@ -233,47 +235,85 @@ do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
* Options2 for active open.
*/
static uint32_t
calc_opt2a(struct socket *so, struct toepcb *toep)
calc_opt2a(struct socket *so, struct toepcb *toep,
const struct offload_settings *s)
{
struct tcpcb *tp = so_sototcpcb(so);
struct port_info *pi = toep->vi->pi;
struct adapter *sc = pi->adapter;
uint32_t opt2;
uint32_t opt2 = 0;
opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
/*
* rx flow control, rx coalesce, congestion control, and tx pace are all
* explicitly set by the driver. On T5+ the ISS is also set by the
* driver to the value picked by the kernel.
*/
if (is_t4(sc)) {
opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
} else {
opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */
opt2 |= F_T5_ISS; /* ISS provided in CPL */
}
if (tp->t_flags & TF_SACK_PERMIT)
if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
opt2 |= F_SACK_EN;
if (tp->t_flags & TF_REQ_TSTMP)
if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
opt2 |= F_TSTAMPS_EN;
if (tp->t_flags & TF_REQ_SCALE)
opt2 |= F_WND_SCALE_EN;
if (V_tcp_do_ecn)
if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
opt2 |= F_CCTRL_ECN;
/* RX_COALESCE is always a valid value (M_RX_COALESCE). */
if (is_t4(sc))
opt2 |= F_RX_COALESCE_VALID;
/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
/* These defaults are subject to ULP specific fixups later. */
opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
opt2 |= V_PACE(0);
if (s->cong_algo >= 0)
opt2 |= V_CONG_CNTRL(s->cong_algo);
else if (sc->tt.cong_algorithm >= 0)
opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
else {
opt2 |= F_T5_OPT_2_VALID;
opt2 |= F_T5_ISS;
struct cc_algo *cc = CC_ALGO(tp);
if (strcasecmp(cc->name, "reno") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
else if (strcasecmp(cc->name, "tahoe") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
if (strcasecmp(cc->name, "newreno") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
if (strcasecmp(cc->name, "highspeed") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
else {
/*
* Use newreno in case the algorithm selected by the
* host stack is not supported by the hardware.
*/
opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
}
}
if (sc->tt.rx_coalesce)
if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
opt2 |= V_RX_COALESCE(M_RX_COALESCE);
if (sc->tt.cong_algorithm != -1)
opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
/* Note that ofld_rxq is already set according to s->rxq. */
opt2 |= F_RSS_QUEUE_VALID;
opt2 |= V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
#ifdef USE_DDP_RX_FLOW_CONTROL
if (toep->ulp_mode == ULP_MODE_TCPDDP)
opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
opt2 |= F_RX_FC_DDP;
#endif
if (toep->ulp_mode == ULP_MODE_TLS) {
opt2 |= F_RX_FC_VALID;
opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
opt2 |= F_RX_FC_DISABLE;
}
@ -348,10 +388,12 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
struct wrqe *wr = NULL;
struct ifnet *rt_ifp = rt->rt_ifp;
struct vi_info *vi;
int mtu_idx, rscale, qid_atid, rc, isipv6;
int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid;
struct inpcb *inp = sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
int reason;
struct offload_settings settings;
uint16_t vid = 0xffff;
INP_WLOCK_ASSERT(inp);
KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
@ -363,12 +405,30 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
struct ifnet *ifp = VLAN_COOKIE(rt_ifp);
vi = ifp->if_softc;
VLAN_TAG(ifp, &vid);
} else if (rt_ifp->if_type == IFT_IEEE8023ADLAG)
DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */
else
DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP);
toep = alloc_toepcb(vi, -1, -1, M_NOWAIT | M_ZERO);
rw_rlock(&sc->policy_lock);
settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, vid, inp);
rw_runlock(&sc->policy_lock);
if (!settings.offload)
DONT_OFFLOAD_ACTIVE_OPEN(EPERM);
if (settings.txq >= 0 && settings.txq < vi->nofldtxq)
txqid = settings.txq;
else
txqid = arc4random() % vi->nofldtxq;
txqid += vi->first_ofld_txq;
if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq)
rxqid = settings.rxq;
else
rxqid = arc4random() % vi->nofldrxq;
rxqid += vi->first_ofld_rxq;
toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO);
if (toep == NULL)
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
@ -387,7 +447,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
toep->vnet = so->so_vnet;
set_ulp_mode(toep, select_ulp_mode(so, sc));
set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
SOCKBUF_LOCK(&so->so_rcv);
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
@ -402,7 +462,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
rscale = tp->request_r_scale = select_rcv_wscale();
else
rscale = 0;
mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0);
mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings);
qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | toep->tid;
if (isipv6) {
@ -443,8 +503,8 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
toep->rx_credits, toep->ulp_mode);
cpl->opt2 = calc_opt2a(so, toep);
toep->rx_credits, toep->ulp_mode, &settings);
cpl->opt2 = calc_opt2a(so, toep, &settings);
} else {
struct cpl_act_open_req *cpl = wrtod(wr);
struct cpl_t5_act_open_req *cpl5 = (void *)cpl;
@ -472,8 +532,8 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
&cpl->peer_ip, &cpl->peer_port);
cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
toep->rx_credits, toep->ulp_mode);
cpl->opt2 = calc_opt2a(so, toep);
toep->rx_credits, toep->ulp_mode, &settings);
cpl->opt2 = calc_opt2a(so, toep, &settings);
}
CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__,

View file

@ -121,6 +121,11 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
nparams++;
if (toep->tls.fcplenmax != 0)
nparams++;
if (toep->tc_idx != -1) {
MPASS(toep->tc_idx >= 0 &&
toep->tc_idx < sc->chip_params->nsched_cls);
nparams++;
}
flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
@ -172,6 +177,8 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
if (toep->tls.fcplenmax != 0)
FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
if (toep->tc_idx != -1)
FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
#undef FLOWC_PARAM
KASSERT(paramidx == nparams, ("nparams mismatch"));
@ -333,19 +340,19 @@ assign_rxopt(struct tcpcb *tp, unsigned int opt)
n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
else
n = sizeof(struct ip) + sizeof(struct tcphdr);
if (V_tcp_do_rfc1323)
n += TCPOLEN_TSTAMP_APPA;
tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;
CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid,
G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]);
if (G_TCPOPT_TSTAMP(opt)) {
tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */
tp->ts_recent = 0; /* hmmm */
tp->ts_recent_age = tcp_ts_getticks();
tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
}
CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__,
toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)],
tp->t_maxseg);
if (G_TCPOPT_SACK(opt))
tp->t_flags |= TF_SACK_PERMIT; /* should already be set */
else

View file

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/fnv_hash.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
@ -62,6 +63,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>
#include "common/common.h"
#include "common/t4_msg.h"
@ -84,7 +86,8 @@ static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *);
static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *,
struct offload_settings *);
static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
static void send_reset_synqe(struct toedev *, struct synq_entry *);
@ -513,9 +516,17 @@ t4_listen_start(struct toedev *tod, struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct listen_ctx *lctx;
int i, rc, v;
struct offload_settings settings;
INP_WLOCK_ASSERT(inp);
rw_rlock(&sc->policy_lock);
settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff,
inp);
rw_runlock(&sc->policy_lock);
if (!settings.offload)
return (0);
/* Don't start a hardware listener for any loopback address. */
if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
return (0);
@ -948,12 +959,22 @@ t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
}
static inline void
save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi)
save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi,
struct offload_settings *s)
{
uint32_t txqid, rxqid;
txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
if (s->txq >= 0 && s->txq < vi->nofldtxq)
txqid = s->txq;
else
txqid = arc4random() % vi->nofldtxq;
txqid += vi->first_ofld_txq;
if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
rxqid = s->rxq;
else
rxqid = arc4random() % vi->nofldrxq;
rxqid += vi->first_ofld_rxq;
m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
}
@ -1019,50 +1040,88 @@ t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
*/
static uint32_t
calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode)
const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode,
struct cc_algo *cc, const struct offload_settings *s)
{
struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
uint32_t opt2;
uint32_t opt2 = 0;
opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) |
F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
if (V_tcp_do_rfc1323) {
if (tcpopt->tstamp)
opt2 |= F_TSTAMPS_EN;
if (tcpopt->sack)
opt2 |= F_SACK_EN;
if (tcpopt->wsf <= 14)
opt2 |= F_WND_SCALE_EN;
/*
* rx flow control, rx coalesce, congestion control, and tx pace are all
* explicitly set by the driver. On T5+ the ISS is also set by the
* driver to the value picked by the kernel.
*/
if (is_t4(sc)) {
opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
} else {
opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */
opt2 |= F_T5_ISS; /* ISS provided in CPL */
}
if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323)))
opt2 |= F_SACK_EN;
if (tcpopt->tstamp &&
(s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
opt2 |= F_TSTAMPS_EN;
if (tcpopt->wsf < 15 && V_tcp_do_rfc1323)
opt2 |= F_WND_SCALE_EN;
if (th->th_flags & (TH_ECE | TH_CWR) &&
(s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
opt2 |= F_CCTRL_ECN;
/* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */
if (is_t4(sc))
opt2 |= F_RX_COALESCE_VALID;
/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
/* These defaults are subject to ULP specific fixups later. */
opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
opt2 |= V_PACE(0);
if (s->cong_algo >= 0)
opt2 |= V_CONG_CNTRL(s->cong_algo);
else if (sc->tt.cong_algorithm >= 0)
opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
else {
opt2 |= F_T5_OPT_2_VALID;
opt2 |= F_T5_ISS;
if (strcasecmp(cc->name, "reno") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
else if (strcasecmp(cc->name, "tahoe") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
if (strcasecmp(cc->name, "newreno") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
if (strcasecmp(cc->name, "highspeed") == 0)
opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
else {
/*
* Use newreno in case the algorithm selected by the
* host stack is not supported by the hardware.
*/
opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
}
}
if (sc->tt.rx_coalesce)
if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
opt2 |= V_RX_COALESCE(M_RX_COALESCE);
if (sc->tt.cong_algorithm != -1)
opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
/* Note that ofld_rxq is already set according to s->rxq. */
opt2 |= F_RSS_QUEUE_VALID;
opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id);
#ifdef USE_DDP_RX_FLOW_CONTROL
if (ulp_mode == ULP_MODE_TCPDDP)
opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
opt2 |= F_RX_FC_DDP;
#endif
if (ulp_mode == ULP_MODE_TLS) {
opt2 |= F_RX_FC_VALID;
opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
opt2 |= F_RX_FC_DISABLE;
}
return htobe32(opt2);
return (htobe32(opt2));
}
static void
@ -1199,6 +1258,7 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
#ifdef INVARIANTS
unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif
struct offload_settings settings;
KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
("%s: unexpected opcode 0x%x", __func__, opcode));
@ -1334,15 +1394,23 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
REJECT_PASS_ACCEPT();
}
so = inp->inp_socket;
rw_rlock(&sc->policy_lock);
settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp);
rw_runlock(&sc->policy_lock);
if (!settings.offload) {
INP_WUNLOCK(inp);
free(wr, M_CXGBE);
REJECT_PASS_ACCEPT();
}
mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
wnd = min(wnd, MAX_RCV_WND);
rx_credits = min(wnd >> 10, M_RCV_BUFSIZ);
save_qids_in_mbuf(m, vi);
save_qids_in_mbuf(m, vi, &settings);
get_qids_from_mbuf(m, NULL, &rxqid);
if (is_t4(sc))
@ -1352,7 +1420,7 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
}
ulp_mode = select_ulp_mode(so, sc);
ulp_mode = select_ulp_mode(so, sc, &settings);
switch (ulp_mode) {
case ULP_MODE_TCPDDP:
synqe->flags |= TPF_SYNQE_TCPDDP;
@ -1361,8 +1429,10 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
synqe->flags |= TPF_SYNQE_TLS;
break;
}
rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode);
rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode);
rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode,
&settings);
rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode,
CC_ALGO(intotcpcb(inp)), &settings);
synqe->tid = tid;
synqe->lctx = lctx;

View file

@ -51,6 +51,8 @@ __FBSDID("$FreeBSD$");
#include <sys/taskqueue.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
@ -137,15 +139,11 @@ alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
txsd_total = tx_credits /
howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
if (txqid < 0)
txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq;
KASSERT(txqid >= vi->first_ofld_txq &&
txqid < vi->first_ofld_txq + vi->nofldtxq,
("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
vi->first_ofld_txq, vi->nofldtxq));
if (rxqid < 0)
rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq;
KASSERT(rxqid >= vi->first_ofld_rxq &&
rxqid < vi->first_ofld_rxq + vi->nofldrxq,
("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
@ -569,27 +567,28 @@ queue_tid_release(struct adapter *sc, int tid)
}
/*
* What mtu_idx to use, given a 4-tuple and/or an MSS cap
* What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt
* have the MSS that we should advertise in our SYN. Advertised MSS doesn't
* account for any TCP options so the effective MSS (only payload, no headers or
* options) could be different. We fill up tp->t_maxseg with the effective MSS
* at the end of the 3-way handshake.
*/
int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss)
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
struct offload_settings *s)
{
unsigned short *mtus = &sc->params.mtus[0];
int i, mss, n;
int i, mss, mtu;
KASSERT(inc != NULL || pmss > 0,
("%s: at least one of inc/pmss must be specified", __func__));
mss = inc ? tcp_mssopt(inc) : pmss;
if (pmss > 0 && mss > pmss)
mss = pmss;
MPASS(inc != NULL);
mss = s->mss > 0 ? s->mss : tcp_mssopt(inc);
if (inc->inc_flags & INC_ISIPV6)
n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
else
n = sizeof(struct ip) + sizeof(struct tcphdr);
mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr);
for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++)
for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
continue;
return (i);
@ -632,33 +631,32 @@ select_rcv_wscale(void)
*/
uint64_t
calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
int mtu_idx, int rscale, int rx_credits, int ulp_mode)
int mtu_idx, int rscale, int rx_credits, int ulp_mode,
struct offload_settings *s)
{
int keepalive;
uint64_t opt0;
MPASS(so != NULL);
MPASS(vi != NULL);
KASSERT(rx_credits <= M_RCV_BUFSIZ,
("%s: rcv_bufsiz too high", __func__));
opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits);
V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits) |
V_L2T_IDX(e->idx) | V_SMAC_SEL(vi->smt_idx) |
V_TX_CHAN(vi->pi->tx_chan);
if (so != NULL) {
keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE;
opt0 |= V_KEEP_ALIVE(keepalive != 0);
if (s->nagle < 0) {
struct inpcb *inp = sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
int keepalive = tcp_always_keepalive ||
so_options_get(so) & SO_KEEPALIVE;
opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
opt0 |= V_KEEP_ALIVE(keepalive != 0);
}
if (e != NULL)
opt0 |= V_L2T_IDX(e->idx);
if (vi != NULL) {
opt0 |= V_SMAC_SEL(vi->smt_idx);
opt0 |= V_TX_CHAN(vi->pi->tx_chan);
}
} else
opt0 |= V_NAGLE(s->nagle != 0);
return htobe64(opt0);
}
@ -720,12 +718,15 @@ is_tls_sock(struct socket *so, struct adapter *sc)
}
int
select_ulp_mode(struct socket *so, struct adapter *sc)
select_ulp_mode(struct socket *so, struct adapter *sc,
struct offload_settings *s)
{
if (can_tls_offload(sc) && is_tls_sock(so, sc))
if (can_tls_offload(sc) &&
(s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc))))
return (ULP_MODE_TLS);
else if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0)
else if (s->ddp > 0 ||
(s->ddp < 0 && sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0))
return (ULP_MODE_TCPDDP);
else
return (ULP_MODE_NONE);
@ -1093,6 +1094,181 @@ free_tom_data(struct adapter *sc, struct tom_data *td)
free(td, M_CXGBE);
}
/*
 * Build a synthetic Ethernet+IP+TCP frame describing the connection so that
 * a rule's pcap-style BPF filter can be run against it.  For an active open
 * the inpcb's local/foreign addresses and ports are used; for a listen only
 * the local side is known, so it is mirrored into the destination fields.
 * vtag == 0xffff means "no VLAN"; otherwise a VLAN header carrying vtag is
 * emitted.  Returns a malloc'd (M_CXGBE, M_NOWAIT) buffer, or NULL, and sets
 * *pktlen and *buflen to the frame length.  The caller frees the buffer.
 */
static char *
prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
    int *buflen)
{
	char *pkt;
	struct tcphdr *th;
	int ipv6, len;
	const int maxlen =
	    max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
	    max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
	    sizeof(struct tcphdr);

	MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);

	pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
	if (pkt == NULL)
		return (NULL);

	ipv6 = inp->inp_vflag & INP_IPV6;
	len = 0;

	if (vtag == 0xffff) {
		struct ether_header *eh = (void *)pkt;

		if (ipv6)
			eh->ether_type = htons(ETHERTYPE_IPV6);
		else
			eh->ether_type = htons(ETHERTYPE_IP);
		len += sizeof(*eh);
	} else {
		struct ether_vlan_header *evh = (void *)pkt;

		evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
		evh->evl_tag = htons(vtag);
		if (ipv6)
			evh->evl_proto = htons(ETHERTYPE_IPV6);
		else
			evh->evl_proto = htons(ETHERTYPE_IP);
		len += sizeof(*evh);
	}

	if (ipv6) {
		struct ip6_hdr *ip6 = (void *)&pkt[len];

		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_plen = htons(sizeof(struct tcphdr));
		ip6->ip6_nxt = IPPROTO_TCP;
		if (open_type == OPEN_TYPE_ACTIVE) {
			ip6->ip6_src = inp->in6p_laddr;
			ip6->ip6_dst = inp->in6p_faddr;
		} else if (open_type == OPEN_TYPE_LISTEN) {
			ip6->ip6_src = inp->in6p_laddr;
			ip6->ip6_dst = ip6->ip6_src;
		}
		len += sizeof(*ip6);
	} else {
		struct ip *ip = (void *)&pkt[len];

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_p = IPPROTO_TCP;
		if (open_type == OPEN_TYPE_ACTIVE) {
			ip->ip_src = inp->inp_laddr;
			ip->ip_dst = inp->inp_faddr;
		} else if (open_type == OPEN_TYPE_LISTEN) {
			ip->ip_src = inp->inp_laddr;
			ip->ip_dst = ip->ip_src;
		}
		len += sizeof(*ip);
	}

	th = (void *)&pkt[len];
	if (open_type == OPEN_TYPE_ACTIVE) {
		th->th_sport = inp->inp_lport;	/* network byte order already */
		th->th_dport = inp->inp_fport;	/* ditto */
	} else if (open_type == OPEN_TYPE_LISTEN) {
		th->th_sport = inp->inp_lport;	/* network byte order already */
		th->th_dport = th->th_sport;
	}
	/*
	 * Was "sizeof(th)" -- the size of the pointer, not the TCP header --
	 * which under-reported the frame length by the header size and made
	 * port-matching filters operate past the claimed buffer length.
	 */
	len += sizeof(*th);

	*pktlen = *buflen = len;
	return (pkt);
}
/*
 * Find the offload settings that apply to a new listen (OPEN_TYPE_LISTEN),
 * active open (OPEN_TYPE_ACTIVE), or passive open (OPEN_TYPE_PASSIVE).  The
 * rules of the installed policy are matched in order and the first match
 * wins; if nothing matches, offload is disallowed (the implicit final rule).
 * With no policy installed the result depends on cop_managed_offloading.
 * For passive opens m is the CPL_PASS_ACCEPT_REQ mbuf (the embedded frame is
 * filtered directly); for the other types a synthetic frame is built from
 * inp and vtag.  Caller must hold sc->policy_lock (read or write).
 */
const struct offload_settings *
lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
    uint16_t vtag, struct inpcb *inp)
{
	const struct t4_offload_policy *op;
	char *pkt;
	struct offload_rule *r;
	int i, matched, pktlen, buflen;
	static const struct offload_settings allow_offloading_settings = {
		.offload = 1,
		.rx_coalesce = -1,
		.cong_algo = -1,
		.sched_class = -1,
		.tstamp = -1,
		.sack = -1,
		.nagle = -1,
		.ecn = -1,
		.ddp = -1,
		.tls = -1,
		.txq = -1,
		.rxq = -1,
		.mss = -1,
	};
	static const struct offload_settings disallow_offloading_settings = {
		.offload = 0,
		/* rest is irrelevant when offload is off. */
	};

	rw_assert(&sc->policy_lock, RA_LOCKED);

	/*
	 * If there's no Connection Offloading Policy attached to the device
	 * then we need to return a default static policy.  If
	 * "cop_managed_offloading" is true, then we need to disallow
	 * offloading until a COP is attached to the device.  Otherwise we
	 * allow offloading ...
	 */
	op = sc->policy;
	if (op == NULL) {
		if (sc->tt.cop_managed_offloading)
			return (&disallow_offloading_settings);
		else
			return (&allow_offloading_settings);
	}

	switch (open_type) {
	case OPEN_TYPE_ACTIVE:
	case OPEN_TYPE_LISTEN:
		/*
		 * Pass the caller's vtag through (it was hard-coded to 0xffff
		 * here before, so "vlan" filter expressions could never match
		 * an active open on a VLAN interface).
		 */
		pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
		break;
	case OPEN_TYPE_PASSIVE:
		MPASS(m != NULL);
		pkt = mtod(m, char *);
		MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
		pkt += sizeof(struct cpl_pass_accept_req);
		pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
		buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
		break;
	default:
		MPASS(0);
		return (&disallow_offloading_settings);
	}

	if (pkt == NULL || pktlen == 0 || buflen == 0)
		return (&disallow_offloading_settings);

	/*
	 * matched must be initialized: if every rule is skipped by the
	 * open_type filter below it would otherwise be read uninitialized.
	 */
	matched = 0;
	r = &op->rule[0];
	for (i = 0; i < op->nrules; i++, r++) {
		if (r->open_type != open_type &&
		    r->open_type != OPEN_TYPE_DONTCARE) {
			continue;
		}
		matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
		if (matched)
			break;
	}

	if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
		free(pkt, M_CXGBE);

	return (matched ? &r->settings : &disallow_offloading_settings);
}
static void
reclaim_wr_resources(void *arg, int count)
{

View file

@ -87,6 +87,7 @@ enum {
};
struct sockopt;
struct offload_settings;
struct ofld_tx_sdesc {
uint32_t plen; /* payload length */
@ -333,13 +334,15 @@ void *lookup_tid(struct adapter *, int);
void update_tid(struct adapter *, int, void *);
void remove_tid(struct adapter *, int, int);
void release_tid(struct adapter *, int, struct sge_wrq *);
int find_best_mtu_idx(struct adapter *, struct in_conninfo *, int);
int find_best_mtu_idx(struct adapter *, struct in_conninfo *,
struct offload_settings *);
u_long select_rcv_wnd(struct socket *);
int select_rcv_wscale(void);
uint64_t calc_opt0(struct socket *, struct vi_info *, struct l2t_entry *,
int, int, int, int);
int, int, int, int, struct offload_settings *);
uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
int select_ulp_mode(struct socket *, struct adapter *);
int select_ulp_mode(struct socket *, struct adapter *,
struct offload_settings *);
void set_ulp_mode(struct toepcb *, int);
int negative_advice(int);
struct clip_entry *hold_lip(struct tom_data *, struct in6_addr *,
@ -416,6 +419,8 @@ void handle_ddp_close(struct toepcb *, struct tcpcb *, uint32_t);
void handle_ddp_indicate(struct toepcb *);
void handle_ddp_tcb_rpl(struct toepcb *, const struct cpl_set_tcb_rpl *);
void insert_ddp_data(struct toepcb *, uint32_t);
const struct offload_settings *lookup_offload_policy(struct adapter *, int,
struct mbuf *, uint16_t, struct inpcb *);
/* t4_tls.c */
bool can_tls_offload(struct adapter *);

View file

@ -8,6 +8,7 @@ SRCS+= tcbinfot4.c tcbshowt4.c
SRCS+= tcbinfot5.c tcbshowt5.c
SRCS+= tcbinfot6.c tcbshowt6.c
CFLAGS+= -I${SRCTOP}/sys/dev/cxgbe -I${SRCTOP}/sys -I.
LIBADD= pcap
WARNS?= 2
.include <bsd.prog.mk>

View file

@ -31,7 +31,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd March 6, 2017
.Dd April 13, 2018
.Dt CXGBETOOL 8
.Os
.Sh NAME
@ -64,6 +64,10 @@
.It
.Nm Ar nexus Cm memdump Ar addr len
.It
.Nm Ar nexus Cm policy Ar cop.txt
.It
.Nm Ar nexus Cm policy clear
.It
.Nm Ar nexus Bro Cm reg | reg64 Brc Ar addr Ns Op Ar =val
.It
.Nm Ar nexus Cm regdump Op Ar register-block ...
@ -378,6 +382,144 @@ bytes of data of the card's memory starting at
.Ar addr Ns .
The card's memory map is available in
.Va dev.t4nex.%d.misc.meminfo Ns .
.It Cm policy Ar cop.txt
Install the Connection Offload Policy (COP) in
.Ar cop.txt Ns .
A COP offers fine-grained control over which connections get offloaded and with
what parameters.
Set
.Cm hw.cxgbe.cop_managed_offloading="1"
in loader.conf to ensure that t4_tom will not offload any connection before a
COP is installed.
Note that t4_tom must be loaded and operational (IFCAP_TOE enabled) as always
for any kind of offload based on the hardware TOE.
.Bl -column -offset indent "COP installed" "cop_managed_offloading" "Behavior"
.It Sy COP installed Ta Sy cop_managed_offloading Ta Sy Behavior
.It NO Ta 0 Ta offload all [Default]
.It NO Ta 1 Ta no offload
.It YES Ta Don't Care Ta Rule based offload
.El
.Pp
The policy file consists of empty lines, comments (lines beginning with #) and
any number of rules.
Rules are applied in the order they appear in the file and processing stops at
the first match.
There is an implicit rule that disables offload for connections that do not
match anything in the policy.
.Pp
Each rule consists of a filter part, which determines what connections the
rule applies to, and a settings part, which determines whether matching
connections will be offloaded and, if so, with what settings.
The general form of a rule is
.Bl -ohang -offset indent
.It Cm \&[ Ar socket-type Cm \&] Ar pcap-filter Cm => Ar settings
.Pp
.Ar socket-type
is one of the following.
.Bl -tag -width "X" -compact
.It Sy A
Active open.
Connection is being opened by this host.
.It Sy P
Passive open.
Connection was requested by a peer.
.It Sy L
Listen called on a socket.
Disabling offload in such a rule will prevent a hardware listener from being started.
.It Sy D
Don't care.
Matches all of the above.
.El
.Pp
.Ar pcap-filter
is an expression that follows the
.Xr pcap-filter 7
syntax, or it is the keyword
.Cm all
that matches everything.
.Pp
.Ar settings
determine whether connections matching
.Ar socket-type
and
.Ar pcap-filter
are offloaded and optionally sets some per-connection properties if they are.
A combination of the following is allowed.
.Bl -tag -width "timestamp" -compact
.It Cm offload
Connection should be offloaded.
Use
.Cm !offload
or
.Cm not offload
to disable offload instead.
.It Cm coalesce
Enable rx payload coalescing.
Negate to disable.
.It Cm timestamp
Enable TCP timestamp option.
Negate to disable.
.It Cm sack
Enable TCP Selective Acknowledgements (SACK).
Negate to disable.
.It Cm nagle
Enable Nagle's algorithm.
Negate to disable.
.It Cm ecn
Enable Explicit Congestion Notification (ECN).
Negate to disable.
.It Cm ddp
Use Direct Data Placement (zero copy receive) and zero copy transmit on the
connection to service AIO requests on the socket.
Negate to disable.
.It Cm tls
Set ULP mode to ULP_MODE_TLS.
.It Cm cong Ar algo
Use the specified congestion control algorithm.
.Ar algo
must be one of
.Cm reno Ns , Cm tahoe Ns , Cm newreno Ns , or Cm highspeed Ns .
.It Cm class Ar sc
Bind the connection to the specified tx scheduling class.
Valid range is 0 to 14 (for T4) and 0 to 15 (T5 onwards).
.It Cm rxq Ar qid
Use the specified offload rx queue.
.Ar qid
should be between 0 and nofldrxq for the ifnet.
.It Cm txq Ar qnum
Use the specified offload tx queue.
.Ar qnum
should be between 0 and nofldtxq for the ifnet.
.It Cm bind Ar qnum
Shorthand for
.Cm rxq Ar qnum Cm txq Ar qnum Ns .
Use only when nofldrxq is the same as nofldtxq.
.It Cm mss Ar val
Set the advertised TCP MSS in the SYN for this connection to
.Ar val
(in bytes).
The hardware MTU table must already have an entry that is suitable for the MSS.
.El
.Pp
.It Example of a COP.
Note that hardware listener for port 22 will be IPv4 only because the rule
before it will prevent any IPv6 servers other than the first two. Also note
that outgoing connections to 192.168/16 are the only outgoing connections that
will get offloaded.
.Bd -literal
[L] port 80 => offload
[L] port 443 => offload
[L] ip6 => !offload
[L] port 22 => offload
[P] dst port 80 => offload cong highspeed !sack !ecn
[P] dst port 443 => offload tls
[A] dst net 192.168/16 => offload
[A] all => !offload
[D] port 22 => offload !nagle
.Ed
.El
.It Cm policy clear
Remove the Connection Offload Policy (COP) if one is in use.
.It Bro Cm reg | reg64 Brc Ar addr Ns Op Ar =val
.It Cm regdump Op Ar register-block ...
Display contents of device registers.

View file

@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <pcap.h>
#include "t4_ioctl.h"
#include "tcb_common.h"
@ -106,6 +107,8 @@ usage(FILE *fp)
"\tloadfw <fw-image.bin> install firmware\n"
"\tmemdump <addr> <len> dump a memory range\n"
"\tmodinfo <port> [raw] optics/cable information\n"
"\tpolicy <policy.txt> install offload policy\n"
"\tpolicy clear remove offload policy\n"
"\treg <address>[=<val>] read/write register\n"
"\treg64 <address>[=<val>] read/write 64 bit register\n"
"\tregdump [<module>] ... dump registers\n"
@ -2889,6 +2892,453 @@ sched_queue(int argc, const char *argv[])
return doit(CHELSIO_T4_SCHED_QUEUE, &op);
}
/*
 * Parse a single keyword (and its parameter, if it takes one) from the
 * settings part of an offload rule and record the result in *os.  *pneg
 * counts pending negations ("!" prefixes or the word "not") and is reset
 * once a negatable keyword consumes it.  Keywords that take a parameter
 * pull it from *pnext (the strsep cursor over the remaining input).
 * Returns 0 on success or an errno value after printing a message.
 */
static int
parse_offload_settings_word(const char *s, char **pnext, const char *ws,
    int *pneg, struct offload_settings *os)
{

	while (*s == '!') {
		(*pneg)++;
		s++;
	}

	if (!strcmp(s, "not")) {
		(*pneg)++;
		return (0);
	}

	/* (*pneg + 1) & 1 is 1 for an even number of negations, else 0. */
	if (!strcmp(s, "offload")) {
		os->offload = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "coalesce")) {
		os->rx_coalesce = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "timestamp") || !strcmp(s, "tstamp")) {
		os->tstamp = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "sack")) {
		os->sack = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "nagle")) {
		os->nagle = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "ecn")) {
		os->ecn = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "ddp")) {
		os->ddp = (*pneg + 1) & 1;
		*pneg = 0;
	} else if (!strcmp(s, "tls")) {
		os->tls = (*pneg + 1) & 1;
		*pneg = 0;
	} else {
		char *param, *p;
		long val;

		/* Settings with additional parameter handled here. */

		if (*pneg) {
			warnx("\"%s\" is not a valid keyword, or it does not "
			    "support negation.", s);
			return (EINVAL);
		}

		/* Skip empty tokens to find the parameter. */
		while ((param = strsep(pnext, ws)) != NULL) {
			if (*param != '\0')
				break;
		}
		if (param == NULL) {
			warnx("\"%s\" is not a valid keyword, or it requires a "
			    "parameter that has not been provided.", s);
			return (EINVAL);
		}

		if (!strcmp(s, "cong")) {
			if (!strcmp(param, "reno"))
				os->cong_algo = 0;
			else if (!strcmp(param, "tahoe"))
				os->cong_algo = 1;
			else if (!strcmp(param, "newreno"))
				os->cong_algo = 2;
			else if (!strcmp(param, "highspeed"))
				os->cong_algo = 3;
			else {
				/*
				 * Report the unrecognized parameter, not the
				 * keyword ("cong") itself.
				 */
				warnx("unknown congestion algorithm \"%s\".",
				    param);
				return (EINVAL);
			}
		} else if (!strcmp(s, "class")) {
			val = -1;
			p = str_to_number(param, &val, NULL);
			/* (nsched_cls - 1) is spelled 15 here. */
			if (*p || val < 0 || val > 15) {
				warnx("invalid scheduling class \"%s\". "
				    "\"class\" needs an integer value where "
				    "0 <= value <= 15", param);
				return (EINVAL);
			}
			os->sched_class = val;
		} else if (!strcmp(s, "bind") || !strcmp(s, "txq") ||
		    !strcmp(s, "rxq")) {
			val = -1;	/* -1 means "random" to the driver. */
			if (strcmp(param, "random")) {
				p = str_to_number(param, &val, NULL);
				if (*p || val < 0 || val > 0xffff) {
					warnx("invalid queue specification "
					    "\"%s\". \"%s\" needs an integer"
					    " value, or \"random\".",
					    param, s);
					return (EINVAL);
				}
			}
			if (!strcmp(s, "bind")) {
				os->txq = val;
				os->rxq = val;
			} else if (!strcmp(s, "txq")) {
				os->txq = val;
			} else if (!strcmp(s, "rxq")) {
				os->rxq = val;
			} else {
				return (EDOOFUS);
			}
		} else if (!strcmp(s, "mss")) {
			val = -1;
			p = str_to_number(param, &val, NULL);
			if (*p || val <= 0) {
				warnx("invalid MSS specification \"%s\". "
				    "\"mss\" needs a positive integer value",
				    param);
				return (EINVAL);
			}
			os->mss = val;
		} else {
			warnx("unknown settings keyword: \"%s\"", s);
			return (EINVAL);
		}
	}

	return (0);
}
/*
 * Parse the settings part of a rule (everything after "=>") into *os.
 * Tokenizes a private copy of the input on whitespace and hands each word to
 * parse_offload_settings_word.  Returns 0 on success or an errno value.
 */
static int
parse_offload_settings(const char *settings_ro, struct offload_settings *os)
{
	const char *ws = " \f\n\r\v\t";
	char *settings, *s, *next;
	int rc, nsettings, neg;
	static const struct offload_settings default_settings = {
		.offload = 0,	/* No settings imply !offload */
		.rx_coalesce = -1,
		.cong_algo = -1,
		.sched_class = -1,
		.tstamp = -1,
		.sack = -1,
		.nagle = -1,
		.ecn = -1,
		.ddp = -1,
		.tls = -1,
		.txq = -1,
		.rxq = -1,
		.mss = -1,
	};

	*os = default_settings;

	next = settings = strdup(settings_ro);
	if (settings == NULL) {
		/* Save errno first: warn() itself may clobber it. */
		rc = errno;
		warn(NULL);
		return (rc);
	}

	nsettings = 0;
	rc = 0;
	neg = 0;
	while ((s = strsep(&next, ws)) != NULL) {
		if (*s == '\0')
			continue;
		nsettings++;
		rc = parse_offload_settings_word(s, &next, ws, &neg, os);
		if (rc != 0)
			goto done;
	}
	if (nsettings == 0) {
		warnx("no settings provided");
		rc = EINVAL;
		goto done;
	}
	if (neg > 0) {
		warnx("%d stray negation(s) at end of offload settings", neg);
		rc = EINVAL;
		goto done;
	}
done:
	free(settings);
	return (rc);
}
/*
 * Returns 1 if the line is blank or a comment (first non-whitespace
 * character is '#'), 0 otherwise.  llen is the length of the line.
 */
static int
isempty_line(char *line, size_t llen)
{

	/*
	 * Skip leading whitespace.  Cast to unsigned char: passing a plain
	 * (possibly negative) char to isspace() is undefined behavior.
	 */
	while (isspace((unsigned char)*line)) {
		line++;
		llen--;
	}
	if (llen == 0 || *line == '#' || *line == '\n')
		return (1);

	return (0);
}
/*
 * Returns 1 if str is one of the "match everything" expressions -- "-",
 * "all", or "any", optionally surrounded by whitespace -- 0 otherwise.
 */
static int
special_offload_rule(char *str)
{

	/*
	 * Skip leading whitespace.  Cast to unsigned char: passing a plain
	 * (possibly negative) char to isspace() is undefined behavior.
	 */
	while (isspace((unsigned char)*str))
		str++;

	/* check for special strings: "-", "all", "any" */
	if (*str == '-') {
		str++;
	} else if (!strncmp(str, "all", 3) || !strncmp(str, "any", 3)) {
		str += 3;
	} else {
		return (0);
	}

	/* Only trailing whitespace may follow the special token. */
	while (isspace((unsigned char)*str))
		str++;

	return (*str == '\0');
}
/*
 * A rule has 3 parts: an open-type, a match expression, and offload settings.
 *
 * [<open-type>] <expr> => <settings>
 *
 * Parses one such line into *r.  Returns 0 on success or EINVAL after
 * printing a message.  On success r->bpf_prog may hold a compiled filter
 * that the caller must eventually release with pcap_freecode().
 */
static int
parse_offload_policy_line(size_t lno, char *line, size_t llen, pcap_t *pd,
    struct offload_rule *r)
{
	char *expr, *settings, *s;

	bzero(r, sizeof(*r));

	/*
	 * Skip leading whitespace, keeping llen in sync.  Without the llen
	 * adjustment the trailing-trim below would index past the end of the
	 * string for any indented rule (and fail to strip the newline).
	 * unsigned char cast: plain char to isspace() is undefined behavior.
	 */
	while (isspace((unsigned char)*line)) {
		line++;
		llen--;
	}

	/* Trim trailing whitespace (the caller filters out blank lines). */
	s = &line[llen - 1];
	while (isspace((unsigned char)*s)) {
		*s-- = '\0';
		llen--;
	}

	/*
	 * First part of the rule: '[X]' where X = A/D/L/P
	 */
	if (*line++ != '[') {
		warnx("missing \"[\" on line %zd", lno);
		return (EINVAL);
	}
	switch (*line) {
	case 'A':
	case 'D':
	case 'L':
	case 'P':
		r->open_type = *line;
		break;
	default:
		warnx("invalid socket-type \"%c\" on line %zd.", *line, lno);
		return (EINVAL);
	}
	line++;
	if (*line++ != ']') {
		warnx("missing \"]\" after \"[%c\" on line %zd",
		    r->open_type, lno);
		return (EINVAL);
	}

	/* Skip whitespace. */
	while (isspace((unsigned char)*line))
		line++;

	/*
	 * Rest of the rule: <expr> => <settings>
	 */
	expr = line;
	s = strstr(line, "=>");
	if (s == NULL) {
		warnx("missing \"=>\" on line %zd", lno);
		return (EINVAL);
	}
	settings = s + 2;
	while (isspace((unsigned char)*settings))
		settings++;
	*s = '\0';	/* terminate <expr> where "=>" began */

	/*
	 * <expr> is either a special name (all, any) or a pcap-filter(7).
	 * In case of a special name the bpf_prog stays all-zero.
	 */
	if (!special_offload_rule(expr)) {
		if (pcap_compile(pd, &r->bpf_prog, expr, 1,
		    PCAP_NETMASK_UNKNOWN) < 0) {
			warnx("failed to compile \"%s\" on line %zd: %s", expr,
			    lno, pcap_geterr(pd));
			return (EINVAL);
		}
	}

	/* settings to apply on a match. */
	if (parse_offload_settings(settings, &r->settings) != 0) {
		warnx("failed to parse offload settings \"%s\" on line %zd",
		    settings, lno);
		pcap_freecode(&r->bpf_prog);
		return (EINVAL);
	}

	return (0);
}
/*
* Note that op itself is not dynamically allocated.
*/
static void
free_offload_policy(struct t4_offload_policy *op)
{
int i;
for (i = 0; i < op->nrules; i++) {
/*
* pcap_freecode can cope with empty bpf_prog, which is the case
* for an rule that matches on 'any/all/-'.
*/
pcap_freecode(&op->rule[i].bpf_prog);
}
free(op->rule);
op->nrules = 0;
op->rule = NULL;
}
#define REALLOC_STRIDE 32
/*
* Fills up op->nrules and op->rule.
*/
static int
parse_offload_policy(const char *fname, struct t4_offload_policy *op)
{
FILE *fp;
char *line;
int lno, maxrules, rc;
size_t lcap, llen;
struct offload_rule *r;
pcap_t *pd;
fp = fopen(fname, "r");
if (fp == NULL) {
warn("Unable to open file \"%s\"", fname);
return (errno);
}
pd = pcap_open_dead(DLT_EN10MB, 128);
if (pd == NULL) {
warnx("Failed to open pcap device");
fclose(fp);
return (EIO);
}
rc = 0;
lno = 0;
lcap = 0;
maxrules = 0;
op->nrules = 0;
op->rule = NULL;
line = NULL;
while ((llen = getline(&line, &lcap, fp)) != -1) {
lno++;
/* Skip empty lines. */
if (isempty_line(line, llen))
continue;
if (op->nrules == maxrules) {
maxrules += REALLOC_STRIDE;
r = realloc(op->rule,
maxrules * sizeof(struct offload_rule));
if (r == NULL) {
warnx("failed to allocate memory for %d rules",
maxrules);
rc = ENOMEM;
goto done;
}
op->rule = r;
}
r = &op->rule[op->nrules];
rc = parse_offload_policy_line(lno, line, llen, pd, r);
if (rc != 0) {
warnx("Error parsing line %d of \"%s\"", lno, fname);
goto done;
}
op->nrules++;
}
free(line);
if (!feof(fp)) {
warn("Error while reading from file \"%s\" at line %d",
fname, lno);
rc = errno;
goto done;
}
if (op->nrules == 0) {
warnx("No valid rules found in \"%s\"", fname);
rc = EINVAL;
}
done:
pcap_close(pd);
fclose(fp);
if (rc != 0) {
free_offload_policy(op);
}
return (rc);
}
/*
 * "policy <file>" command handler.  "policy clear" (or "policy none")
 * removes the current policy by submitting an empty one.
 */
static int
load_offload_policy(int argc, const char *argv[])
{
	int rc = 0;
	const char *fname;
	struct t4_offload_policy op = {0};

	/* Validate argc before touching argv[0]. */
	if (argc != 1) {
		warnx("incorrect number of arguments.");
		return (EINVAL);
	}
	fname = argv[0];

	if (!strcmp(fname, "clear") || !strcmp(fname, "none")) {
		/* op.nrules is 0 and that means clear policy */
		return (doit(CHELSIO_T4_SET_OFLD_POLICY, &op));
	}

	rc = parse_offload_policy(fname, &op);
	if (rc != 0) {
		/* Error message displayed already */
		return (EINVAL);
	}

	rc = doit(CHELSIO_T4_SET_OFLD_POLICY, &op);
	free_offload_policy(&op);

	return (rc);
}
static int
run_cmd(int argc, const char *argv[])
{
@ -2935,6 +3385,8 @@ run_cmd(int argc, const char *argv[])
rc = loadbootcfg(argc, argv);
else if (!strcmp(cmd, "dumpstate"))
rc = dumpstate(argc, argv);
else if (!strcmp(cmd, "policy"))
rc = load_offload_policy(argc, argv);
else {
rc = EINVAL;
warnx("invalid command \"%s\"", cmd);