From d519cedbad91f407a94ab14e1f53bf534bc0e6db Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Fri, 22 Jan 2016 02:07:48 +0000 Subject: [PATCH] Provide new socket option TCP_CCALGOOPT, which stands for TCP congestion control algorithm options. The argument is variable length and is opaque to TCP, forwarded directly to the algorithm's ctl_output method. Provide new includes directory netinet/cc, where algorithm specific headers can be installed. The new API doesn't yet have any in tree consumers. The original code written by lstewart. Reviewed by: rrs, emax Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D711 --- etc/mtree/BSD.include.dist | 2 ++ include/Makefile | 1 + share/man/man4/mod_cc.4 | 14 ++++++++++++-- share/man/man4/tcp.4 | 7 ++++++- share/man/man9/mod_cc.9 | 20 ++++++++++++++++++-- sys/netinet/tcp.h | 1 + sys/netinet/tcp_cc.h | 3 +++ sys/netinet/tcp_usrreq.c | 28 +++++++++++++++++++++++++++- 8 files changed, 70 insertions(+), 6 deletions(-) diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index bb0b26dd3d08..88e80e66a479 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -270,6 +270,8 @@ .. .. netinet + cc + .. .. netinet6 .. diff --git a/include/Makefile b/include/Makefile index c75de6e3573d..8033de1cbc06 100644 --- a/include/Makefile +++ b/include/Makefile @@ -53,6 +53,7 @@ LSUBDIRS= cam/ata cam/scsi \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ netgraph/atm netgraph/netflow \ + netinet/cc \ security/audit \ security/mac_biba security/mac_bsdextended security/mac_lomac \ security/mac_mls security/mac_partition \ diff --git a/share/man/man4/mod_cc.4 b/share/man/man4/mod_cc.4 index f5f44933f7e5..4712a3912bc0 100644 --- a/share/man/man4/mod_cc.4 +++ b/share/man/man4/mod_cc.4 @@ -30,7 +30,7 @@ .\" .\" $FreeBSD$ .\" -.Dd January 12, 2015 +.Dd January 21, 2016 .Dt MOD_CC 4 .Os .Sh NAME @@ -49,7 +49,9 @@ using the facility. 
.Pp The default algorithm is NewReno, and all connections use the default unless -explicitly overridden using the TCP_CONGESTION socket option (see +explicitly overridden using the +.Dv TCP_CONGESTION +socket option (see .Xr tcp 4 for details). The default can be changed using a @@ -57,6 +59,14 @@ The default can be changed using a MIB variable detailed in the .Sx MIB Variables section below. +.Pp +Algorithm specific parameters can be set or queried using the +.Dv TCP_CCALGOOPT +socket option (see +.Xr tcp 4 +for details). +Callers must pass a pointer to algorithm specific data, and specify +its size. .Sh MIB Variables The framework exposes the following variables in the .Va net.inet.tcp.cc diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 index 8c5887fdcfde..de993e7b06a5 100644 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -34,7 +34,7 @@ .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd October 27, 2015 +.Dd January 21, 2016 .Dt TCP 4 .Os .Sh NAME @@ -137,6 +137,11 @@ send window size, receive window size, and bandwidth-controlled window space. +.It Dv TCP_CCALGOOPT +Set or query congestion control algorithm specific parameters. +See +.Xr mod_cc 4 +for details. .It Dv TCP_CONGESTION Select or query the congestion control algorithm that TCP will use for the connection. 
diff --git a/share/man/man9/mod_cc.9 b/share/man/man9/mod_cc.9 index f1cd5be08077..05205ed84551 100644 --- a/share/man/man9/mod_cc.9 +++ b/share/man/man9/mod_cc.9 @@ -31,7 +31,7 @@ .\" .\" $FreeBSD$ .\" -.Dd December 26, 2014 +.Dd January 21, 2016 .Dt MOD_CC 9 .Os .Sh NAME @@ -40,7 +40,8 @@ .Nm CCV .Nd Modular Congestion Control .Sh SYNOPSIS -.In netinet/cc.h +.In netinet/tcp.h +.In netinet/tcp_cc.h .In netinet/cc/cc_module.h .Fn DECLARE_CC_MODULE "ccname" "ccalgo" .Fn CCV "ccv" "what" @@ -74,6 +75,7 @@ struct cc_algo { void (*cong_signal) (struct cc_var *ccv, uint32_t type); void (*post_recovery) (struct cc_var *ccv); void (*after_idle) (struct cc_var *ccv); + int (*ctl_output)(struct cc_var *, struct sockopt *, void *); }; .Ed .Pp @@ -166,6 +168,20 @@ function is called when data transfer resumes after an idle period. It should be implemented to adjust state as required. .Pp The +.Va ctl_output +function is called when +.Xr getsockopt 2 +or +.Xr setsockopt 2 +is called on a +.Xr tcp 4 +socket with the +.Va struct sockopt +pointer forwarded unmodified from the TCP control, and a +.Va void * +pointer to an algorithm specific argument. 
+.Pp +The .Fn DECLARE_CC_MODULE macro provides a convenient wrapper around the .Xr DECLARE_MODULE 9 diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index bfc8073fc996..470381040705 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -165,6 +165,7 @@ struct tcphdr { #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ +#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ diff --git a/sys/netinet/tcp_cc.h b/sys/netinet/tcp_cc.h index 4a2b0c80ac21..d90cd1945582 100644 --- a/sys/netinet/tcp_cc.h +++ b/sys/netinet/tcp_cc.h @@ -151,6 +151,9 @@ struct cc_algo { /* Called for an additional ECN processing apart from RFC3168. */ void (*ecnpkt_handler)(struct cc_var *ccv); + /* Called for {get|set}sockopt() on a TCP socket with TCP_CCALGOOPT. */ + int (*ctl_output)(struct cc_var *, struct sockopt *, void *); + STAILQ_ENTRY (cc_algo) entries; }; diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 29e92b2e14e1..4b3150bad82a 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1480,7 +1480,33 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp struct tcp_info ti; struct cc_algo *algo; char *buf; - + + /* + * For TCP_CCALGOOPT forward the control to CC module, for both + * SOPT_SET and SOPT_GET. 
+ */ + switch (sopt->sopt_name) { + case TCP_CCALGOOPT: + INP_WUNLOCK(inp); + buf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); + error = sooptcopyin(sopt, buf, sopt->sopt_valsize, + sopt->sopt_valsize); + if (error) { + free(buf, M_TEMP); + return (error); + } + INP_WLOCK_RECHECK(inp); + if (CC_ALGO(tp)->ctl_output != NULL) + error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, buf); + else + error = ENOENT; + INP_WUNLOCK(inp); + if (error == 0 && sopt->sopt_dir == SOPT_GET) + error = sooptcopyout(sopt, buf, sopt->sopt_valsize); + free(buf, M_TEMP); + return (error); + } + switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) {