Add the "TCP Blackbox Recorder" which we discussed at the developer
summits at BSDCan and BSDCam in 2017.

The TCP Blackbox Recorder allows you to capture events on a TCP connection
in a ring buffer. It stores metadata with the event. It optionally stores
the TCP header associated with an event (if the event is associated with a
packet) and also optionally stores information on the sockets.

It supports setting a log ID on a TCP connection and using this to correlate
multiple connections that share a common log ID.

You can log connections in different modes. If you are doing a coordinated
test with a particular connection, you may tell the system to put it in
mode 4 (continuous dump). Or, if you just want to monitor for errors, you
can put it in mode 1 (ring buffer) and dump all the ring buffers associated
with the connection ID when we receive an error signal for that connection
ID. You can set a default mode that will be applied to a particular ratio
of incoming connections. You can also manually set a mode using a socket
option.

This commit includes only basic probes. rrs@ has added quite an abundance
of probes in his TCP development work. He plans to commit those soon.

There are user-space programs which we plan to commit as ports. These read
the data from the log device and output pcapng files, and then let you
analyze the data (and metadata) in the pcapng files.

Reviewed by:	gnn (previous version)
Obtained from:	Netflix, Inc.
Relnotes:	yes
Differential Revision:	https://reviews.freebsd.org/D11085
This commit is contained in:
Jonathan T. Looney 2018-03-22 09:40:08 +00:00
parent bf8e3513bb
commit 2529f56ed3
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=331347
19 changed files with 3680 additions and 8 deletions

View file

@ -158,6 +158,8 @@
..
speaker
..
tcp_log
..
usb
..
vkbd

View file

@ -47,7 +47,7 @@ LSUBDIRS= cam/ata cam/mmc cam/nvme cam/scsi \
dev/hwpmc dev/hyperv \
dev/ic dev/iicbus dev/io dev/lmc dev/mfi dev/mmc dev/nvme \
dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/smbus \
dev/speaker dev/vkbd dev/wi \
dev/speaker dev/tcp_log dev/vkbd dev/wi \
fs/devfs fs/fdescfs fs/msdosfs fs/nandfs fs/nfs fs/nullfs \
fs/procfs fs/smbfs fs/udf fs/unionfs \
geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \

View file

@ -3161,6 +3161,7 @@ dev/syscons/star/star_saver.c optional star_saver
dev/syscons/syscons.c optional sc
dev/syscons/sysmouse.c optional sc
dev/syscons/warp/warp_saver.c optional warp_saver
dev/tcp_log/tcp_log_dev.c optional inet | inet6
dev/tdfx/tdfx_linux.c optional tdfx_linux tdfx compat_linux
dev/tdfx/tdfx_pci.c optional tdfx pci
dev/ti/if_ti.c optional ti pci
@ -4309,6 +4310,7 @@ netinet/tcp_debug.c optional tcpdebug
netinet/tcp_fastopen.c optional inet tcp_rfc7413 | inet6 tcp_rfc7413
netinet/tcp_hostcache.c optional inet | inet6
netinet/tcp_input.c optional inet | inet6
netinet/tcp_log_buf.c optional inet | inet6
netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6

View file

@ -0,0 +1,521 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2016-2017
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>
#include <dev/tcp_log/tcp_log_dev.h>
#ifdef TCPLOG_DEBUG_COUNTERS
/* Debug counters; defined outside this file. */
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

/* The device node created at module load. */
static struct cdev *tcp_log_dev;
/* Select/poll bookkeeping shared by every reader of the device. */
static struct selinfo tcp_log_sel;
/* Pending log entries, oldest first. */
static struct log_queueh tcp_log_dev_queue_head =
    STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
/* One tcp_log_dev_info per open file descriptor. */
static struct log_infoh tcp_log_dev_reader_head =
    STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);
MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");
/*
 * Number of open readers.  The address of this variable is also the sleep
 * channel that blocked readers wait on.
 */
static int tcp_log_dev_listeners = 0;
/* Taken around accesses to the queue, the reader list and the counter. */
static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
/* Reference counting on queue entries; UNREF yields refcount_release()'s result. */
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
		    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
		    struct uio *uio __unused, int flags __unused);
/* The read handler inspects flags (FNONBLOCK), so it is not __unused. */
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
		    int flags);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
		    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
		    struct thread *td);
/* Tracks whether the current code path holds tcp_log_dev_queue_lock. */
enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,	/* lock not held */
	QUEUE_LOCKED = 1,	/* lock held */
};
/* Character-device entry points for the tcp_log device. */
static struct cdevsw tcp_log_cdevsw = {
	.d_version =	D_VERSION,
	.d_name =	"tcp_log",
	.d_open =	tcp_log_dev_open,
	.d_read =	tcp_log_dev_read,
	.d_write =	tcp_log_dev_write,
	.d_ioctl =	tcp_log_dev_ioctl,
	.d_poll =	tcp_log_dev_poll,
#ifdef NOTYET
	.d_mmap =	tcp_log_dev_mmap,
#endif
};
/*
 * Under INVARIANTS, assert that the queue lock really is in the state the
 * caller believes it to be (QUEUE_LOCKED or QUEUE_UNLOCKED); any other value
 * panics.  Without INVARIANTS this compiles to nothing.
 */
static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	if (lockstate == QUEUE_LOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	} else if (lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
	} else {
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}
/*
 * Drop one reference on a queue entry.  When that was the last reference,
 * unlink the entry from the global queue and hand it to its destructor.
 *
 * The caller must hold the queue lock.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	/* Someone else still holds a reference; leave the entry queued. */
	if (!TCP_LOG_DEV_QUEUE_UNREF(entry))
		return;

#ifdef TCPLOG_DEBUG_COUNTERS
	counter_u64_add(tcp_log_que_freed, 1);
#endif
	STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
	    tldq_queue);
	entry->tldq_dtor(entry);
}
static void
tcp_log_dev_clear_cdevpriv(void *data)
{
struct tcp_log_dev_info *priv;
struct tcp_log_dev_queue *entry, *entry_tmp;
priv = (struct tcp_log_dev_info *)data;
if (priv == NULL)
return;
/*
* Lock the queue and drop our references. We hold references to all
* the entries starting with tldi_head (or, if tldi_head == NULL, all
* entries in the queue).
*
* Because we don't want anyone adding addition things to the queue
* while we are doing this, we lock the queue.
*/
TCP_LOG_DEV_QUEUE_LOCK();
if (priv->tldi_head != NULL) {
entry = priv->tldi_head;
STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
tldq_queue, entry_tmp) {
tcp_log_dev_clear_refcount(entry);
}
}
tcp_log_dev_listeners--;
KASSERT(tcp_log_dev_listeners >= 0,
("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
tldi_list);
TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
TCP_LOG_DEV_QUEUE_UNLOCK();
free(priv, M_TCPLOGDEV);
}
/*
 * Open handler.  Allocates per-descriptor reader state, registers it as
 * cdevpriv, and takes a reference on every entry currently queued so this
 * reader can consume them.
 *
 * Returns ENODEV for write/exec/append/truncate opens, the error from
 * devfs_set_cdevpriv() if registration fails, otherwise 0.
 */
static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *reader;
	struct tcp_log_dev_queue *qe;
	int error;

	/* File system permissions should normally keep these out. */
	if ((flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC)) != 0)
		return (ENODEV);

	/* Zeroed, so tldi_head/tldi_cur start NULL and tldi_off at 0. */
	reader = malloc(sizeof(*reader), M_TCPLOGDEV, M_WAITOK | M_ZERO);

	error = devfs_set_cdevpriv(reader, tcp_log_dev_clear_cdevpriv);
	if (error != 0) {
		free(reader, M_TCPLOGDEV);
		return (error);
	}

	/*
	 * Count the new listener, publish it on the reader list, and
	 * reference everything already in the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	tcp_log_dev_listeners++;
	STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, reader, tldi_list);
	reader->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
	if (reader->tldi_head != NULL)
		reader->tldi_cur = reader->tldi_head->tldq_buf;
	STAILQ_FOREACH(qe, &tcp_log_dev_queue_head, tldq_queue)
		TCP_LOG_DEV_QUEUE_REF(qe);
	TCP_LOG_DEV_QUEUE_UNLOCK();

	return (0);
}
/* Write handler: the device cannot be written to, so always fail with ENODEV. */
static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}
/*
 * Advance a reader past the entry it has just finished: move tldi_head to
 * the next queue entry, drop the reader's reference on the finished one and
 * clear tldi_cur.
 *
 * *lockstate describes whether the queue lock is held on entry.  If it is
 * not, the lock is acquired and *lockstate is updated; the lock is always
 * held on return.
 */
static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *finished;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}

	finished = priv->tldi_head;
	priv->tldi_cur = NULL;
	/* Read the successor first: dropping the reference may free the entry. */
	priv->tldi_head = STAILQ_NEXT(finished, tldq_queue);
	tcp_log_dev_clear_refcount(finished);
}
/*
 * Read handler.  Copies data from the reader's current log buffer into the
 * caller's uio.  At most one buffer is consumed per call: there is a single
 * uiomove(), bounded by both the bytes left in the buffer and uio_resid.
 *
 * Returns:
 *   - the error from devfs_get_cdevpriv(), if the private data is missing;
 *   - EAGAIN if nothing is queued and FNONBLOCK is set in flags;
 *   - the error from mtx_sleep() (PCATCH is set, so the sleep can be
 *     interrupted);
 *   - EBUSY if the entry's tldq_xform callback fails to produce a buffer;
 *   - the error from uiomove();
 *   - 0 otherwise.
 *
 * lockstate tracks whether the queue lock is held so that the single exit
 * path at "done" can release it exactly once.
 */
static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
struct tcp_log_common_header *buf;
struct tcp_log_dev_info *priv;
struct tcp_log_dev_queue *entry;
ssize_t len;
int lockstate, rv;
/* Get our private info. */
rv = devfs_get_cdevpriv((void **)&priv);
if (rv)
return (rv);
lockstate = QUEUE_UNLOCKED;
/* Do we need to get a new buffer? */
while (priv->tldi_cur == NULL ||
priv->tldi_cur->tlch_length <= priv->tldi_off) {
/* Did we somehow forget to rotate? */
KASSERT(priv->tldi_cur == NULL,
("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
__LINE__));
/* Without INVARIANTS, recover by rotating past the exhausted buffer. */
if (priv->tldi_cur != NULL)
tcp_log_dev_rotate_bufs(priv, &lockstate);
/*
* Before we start looking at tldi_head, we need a lock on the
* queue to make sure tldi_head stays stable.
*/
if (lockstate == QUEUE_UNLOCKED) {
TCP_LOG_DEV_QUEUE_LOCK();
lockstate = QUEUE_LOCKED;
}
/* We need the next buffer. Do we have one? */
if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
rv = EAGAIN;
goto done;
}
if (priv->tldi_head == NULL) {
/* Sleep and wait for more things we can read. */
/* tcp_log_dev_add_log() issues the matching wakeup() on this channel. */
rv = mtx_sleep(&tcp_log_dev_listeners,
&tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
if (rv)
goto done;
/* Woken, but still nothing for this reader: re-evaluate. */
if (priv->tldi_head == NULL)
continue;
}
/*
* We have an entry to read. We want to try to create a
* buffer, if one doesn't already exist.
*/
entry = priv->tldi_head;
if (entry->tldq_buf == NULL) {
TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
buf = (*entry->tldq_xform)(entry);
if (buf == NULL) {
rv = EBUSY;
goto done;
}
/* Cache the buffer on the entry so other readers can reuse it. */
entry->tldq_buf = buf;
}
priv->tldi_cur = entry->tldq_buf;
priv->tldi_off = 0;
}
/* Copy what we can from this buffer to the output buffer. */
if (uio->uio_resid > 0) {
/* Drop locks so we can take page faults. */
if (lockstate == QUEUE_LOCKED)
TCP_LOG_DEV_QUEUE_UNLOCK();
lockstate = QUEUE_UNLOCKED;
KASSERT(priv->tldi_cur != NULL,
("%s: priv->tldi_cur is unexpectedly NULL", __func__));
/* Copy as much as we can to this uio. */
len = priv->tldi_cur->tlch_length - priv->tldi_off;
if (len > uio->uio_resid)
len = uio->uio_resid;
rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
len, uio);
if (rv != 0)
goto done;
priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
counter_u64_add(tcp_log_que_read, len);
#endif
}
/* Are we done with this buffer? If so, find the next one. */
if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
("%s: offset (%ju) exceeds length (%ju)", __func__,
(uintmax_t)priv->tldi_off,
(uintmax_t)priv->tldi_cur->tlch_length));
/* Rotating takes the queue lock if needed; "done" releases it. */
tcp_log_dev_rotate_bufs(priv, &lockstate);
}
done:
tcp_log_dev_queue_validate_lock(lockstate);
if (lockstate == QUEUE_LOCKED)
TCP_LOG_DEV_QUEUE_UNLOCK();
return (rv);
}
/*
 * ioctl handler.  Only the non-blocking and async I/O requests are
 * recognized:
 *   FIONBIO  - accepted without further action;
 *   FIOASYNC - accepted only when being turned off (EINVAL otherwise);
 *   anything else yields ENOIOCTL.
 * Fails with the error from devfs_get_cdevpriv() if the descriptor has no
 * private data.
 */
static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int error;

	/* The private data is fetched only to validate the descriptor. */
	error = devfs_get_cdevpriv((void **)&priv);
	if (error != 0)
		return (error);

	if (cmd == FIONBIO)
		return (0);
	if (cmd == FIOASYNC)
		return (*(int *)data != 0 ? EINVAL : 0);
	return (ENOIOCTL);
}
/*
 * Poll handler.  Reports POLLIN/POLLRDNORM when the reader is partway
 * through a buffer or has an unread entry waiting; otherwise records the
 * thread on tcp_log_sel for a later selwakeup().
 *
 * If the private data cannot be found, or the caller asked only for
 * non-read events, every requested event is reported as ready so that the
 * caller's next operation surfaces the real error.
 */
static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int readable, ready;

	if (devfs_get_cdevpriv((void **)&priv) != 0)
		return (events);

	/* Polling this device only makes sense for reading. */
	if ((events & (POLLIN | POLLRDNORM)) == 0)
		return (events);

	ready = 0;
	/* tldi_head is shared with writers, so inspect it under the lock. */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_cur == NULL)
		readable = (priv->tldi_head != NULL);
	else
		readable = (priv->tldi_off < priv->tldi_cur->tlch_length);
	if (readable)
		ready = events & (POLLIN | POLLRDNORM);
	else
		selrecord(td, &tcp_log_sel);
	TCP_LOG_DEV_QUEUE_UNLOCK();

	return (ready);
}
int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
struct tcp_log_dev_info *priv;
int rv;
bool wakeup_needed;
KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
("%s: Called with both tldq_buf and tldq_xform set to NULL",
__func__));
KASSERT(entry->tldq_dtor != NULL,
("%s: Called with tldq_dtor set to NULL", __func__));
/* Get a lock on the queue. */
TCP_LOG_DEV_QUEUE_LOCK();
/* If no one is listening, tell the caller to free the resources. */
if (tcp_log_dev_listeners == 0) {
rv = ENXIO;
goto done;
}
/* Add this to the end of the tailq. */
STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);
/* Add references for all current listeners. */
refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);
/*
* If any listener is currently stuck on NULL, that means they are
* waiting. Point their head to this new entry.
*/
wakeup_needed = false;
STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
if (priv->tldi_head == NULL) {
priv->tldi_head = entry;
wakeup_needed = true;
}
if (wakeup_needed) {
selwakeup(&tcp_log_sel);
wakeup(&tcp_log_dev_listeners);
}
rv = 0;
done:
TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
TCP_LOG_DEV_QUEUE_UNLOCK();
return (rv);
}
/*
 * Module event handler.  Only MOD_LOAD is supported: it initializes the
 * select state and the queue mutex, then creates the root-owned, read-only
 * (0400) "tcp_log" device node.  Every other event gets EOPNOTSUPP.
 */
static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	if (type != MOD_LOAD)
		return (EOPNOTSUPP);

	if (bootverbose)
		printf("tcp_log: tcp_log device\n");
	memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
	memset(&tcp_log_dev_queue_lock, 0, sizeof(tcp_log_dev_queue_lock));
	mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
	    "tcp_log device queues", MTX_DEF);
	tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD, &tcp_log_cdevsw, 0,
	    NULL, UID_ROOT, GID_WHEEL, 0400, "tcp_log");

	return (0);
}
DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);

View file

@ -0,0 +1,88 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2016
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __tcp_log_dev_h__
#define __tcp_log_dev_h__
/*
 * This is the common header for data streamed from the log device. All
 * blocks of data need to start with this header.
 *
 * tlch_length is a byte count: the device's read path copies exactly
 * tlch_length bytes to the reader, starting at the first byte of this
 * header.
 */
struct tcp_log_common_header {
uint32_t tlch_version; /* Version is specific to type. */
uint32_t tlch_type; /* Type of entry(ies) that follow. */
uint64_t tlch_length; /* Total length, including header. */
} __packed;
/* Presumably a value for tlch_type -- confirm against the producers. */
#define TCP_LOG_DEV_TYPE_BBR 1 /* black box recorder */
#ifdef _KERNEL
/*
 * This is a queue entry. All queue entries need to start with this structure
 * so the common code can cast them to this structure; however, other modules
 * are free to include additional data after this structure.
 *
 * The elements are explained here:
 * tldq_queue: used by the common code to maintain this entry's position in the
 * queue.
 * tldq_buf: should be NULL, or a pointer to a chunk of data. The data must be
 * as long as the common header indicates.
 * tldq_xform: If tldq_buf is NULL, the code will call this to create
 * the tldq_buf object. The function should *not* directly modify tldq_buf,
 * but should return the buffer (which must meet the restrictions
 * indicated for tldq_buf). It may return NULL on failure, in which case
 * the read fails with EBUSY. It is called with the device queue lock held.
 * tldq_dtor: This function is called to free the queue entry. If tldq_buf is
 * not NULL, the dtor function must free that, too. It is called with the
 * device queue lock held, after the entry has been unlinked.
 * tldq_refcnt: used by the common code to indicate how many readers still need
 * this data.
 */
struct tcp_log_dev_queue {
STAILQ_ENTRY(tcp_log_dev_queue) tldq_queue;
struct tcp_log_common_header *tldq_buf;
struct tcp_log_common_header *(*tldq_xform)(struct tcp_log_dev_queue *entry);
void (*tldq_dtor)(struct tcp_log_dev_queue *entry);
volatile u_int tldq_refcnt;
};
/* Head type for the queue of pending entries. */
STAILQ_HEAD(log_queueh, tcp_log_dev_queue);
/*
 * Per-open-descriptor reader state, stored as the descriptor's cdevpriv.
 *
 * tldi_list: linkage on the list of readers.
 * tldi_head: the oldest queue entry this reader has not finished with;
 * NULL when the reader has nothing left to read.
 * tldi_cur: the buffer currently being copied out, or NULL if none has
 * been selected yet.
 * tldi_off: byte offset of the next unread byte within tldi_cur.
 */
struct tcp_log_dev_info {
STAILQ_ENTRY(tcp_log_dev_info) tldi_list;
struct tcp_log_dev_queue *tldi_head;
struct tcp_log_common_header *tldi_cur;
off_t tldi_off;
};
/* Head type for the list of readers. */
STAILQ_HEAD(log_infoh, tcp_log_dev_info);
MALLOC_DECLARE(M_TCPLOGDEV);
/*
 * Queue an entry for delivery to the readers. Returns 0 on success, or
 * ENXIO when no reader has the device open; in that case the entry was not
 * queued and the caller must free it.
 */
int tcp_log_dev_add_log(struct tcp_log_dev_queue *entry);
#endif /* _KERNEL */
#endif /* !__tcp_log_dev_h__ */

View file

@ -639,6 +639,14 @@ static struct witness_order_list_entry order_lists[] = {
{ "dr->dt.di.dr_mtx", &lock_class_sx },
{ "db->db_mtx", &lock_class_sx },
{ NULL, NULL },
/*
* TCP log locks
*/
{ "TCP ID tree", &lock_class_rw },
{ "tcp log id bucket", &lock_class_mtx_sleep },
{ "tcpinp", &lock_class_rw },
{ "TCP log expireq", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* spin locks
*/

View file

@ -168,6 +168,12 @@ struct tcphdr {
#define TCP_NOOPT 8 /* don't use TCP options */
#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */
#define TCP_INFO 32 /* retrieve tcp_info structure */
#define TCP_LOG 34 /* configure event logging for connection */
#define TCP_LOGBUF 35 /* retrieve event log for connection */
#define TCP_LOGID 36 /* configure log ID to correlate connections */
#define TCP_LOGDUMP 37 /* dump connection log events to device */
#define TCP_LOGDUMPID 38 /* dump events from connections with same ID to
device */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
@ -189,6 +195,9 @@ struct tcphdr {
#define TCPI_OPT_ECN 0x08
#define TCPI_OPT_TOE 0x10
/* Maximum length of log ID. */
#define TCP_LOG_ID_LEN 64
/*
* The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
* the caller to query certain information about the state of a TCP

View file

@ -102,6 +102,7 @@ __FBSDID("$FreeBSD$");
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@ -1592,6 +1593,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/* Save segment, if requested. */
tcp_pcap_add(th, m, &(tp->t_inpkts));
#endif
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
tlen, NULL, true);
if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {

2480
sys/netinet/tcp_log_buf.c Normal file

File diff suppressed because it is too large Load diff

353
sys/netinet/tcp_log_buf.h Normal file
View file

@ -0,0 +1,353 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2016-2018
* Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __tcp_log_buf_h__
#define __tcp_log_buf_h__
#define TCP_LOG_REASON_LEN 32
#define TCP_LOG_BUF_VER (6)
/*
* Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires
* 8-byte alignment to work properly on all platforms. Therefore, we will
* enforce 8-byte alignment for all the structures that may appear by
* themselves (instead of being embedded in another structure) in a data
* stream.
*/
#define ALIGN_TCP_LOG __aligned(8)
/*
 * Information about the socketbuffer state. Embedded twice in
 * struct tcp_log_buffer: once for the receive buffer and once for the
 * send buffer.
 */
struct tcp_log_sockbuf
{
uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */
uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */
uint32_t tls_sb_spare; /* spare */
};
/*
 * Optional, verbose information that may be appended to an event log.
 * It appears as the trailing tlb_verbose member of struct tcp_log_buffer.
 * Explicitly padded and aligned to 8 bytes (see ALIGN_TCP_LOG).
 */
struct tcp_log_verbose
{
/* Size of each function-name field below. */
#define TCP_FUNC_LEN 32
char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */
char tlv_trace_func[TCP_FUNC_LEN]; /* Function that
generated trace */
uint32_t tlv_trace_line; /* Line number that generated trace */
uint8_t _pad[4];
} ALIGN_TCP_LOG;
/*
 * Internal RACK state variables. One of the alternatives of
 * union tcp_log_stackspecific (member u_rack).
 */
struct tcp_log_rack
{
uint32_t tlr_rack_rtt; /* rc_rack_rtt */
uint8_t tlr_state; /* Internal RACK state */
uint8_t _pad[3]; /* Padding */
};
/*
 * BBR stack state. One of the alternatives of union tcp_log_stackspecific
 * (member u_bbr).
 *
 * NOTE(review): the meaning of the individual fields, including the
 * "flexN" members, is defined by the code that fills them in and is not
 * visible here -- consult the BBR stack before relying on any of them.
 * Fields are ordered from widest to narrowest, except for the trailing
 * pkt_epoch.
 */
struct tcp_log_bbr {
uint64_t cur_del_rate;
uint64_t delRate;
uint64_t rttProp;
uint64_t bw_inuse;
uint32_t inflight;
uint32_t applimited;
uint32_t delivered;
uint32_t timeStamp;
uint32_t epoch;
uint32_t lt_epoch;
uint32_t pkts_out;
uint32_t flex1;
uint32_t flex2;
uint32_t flex3;
uint32_t flex4;
uint32_t flex5;
uint32_t flex6;
uint32_t lost;
uint16_t pacing_gain;
uint16_t cwnd_gain;
uint16_t flex7;
uint8_t bbr_state;
uint8_t bbr_substate;
uint8_t inpacer;
uint8_t ininput;
uint8_t use_lt_bw;
uint8_t flex8;
uint32_t pkt_epoch;
};
/*
 * Per-stack stack-specific info. Embedded in struct tcp_log_buffer as
 * tlb_stackinfo; which member is meaningful depends on the TCP stack
 * that produced the record.
 */
union tcp_log_stackspecific
{
struct tcp_log_rack u_rack;
struct tcp_log_bbr u_bbr;
};
/*
 * One logged event. The record carries event identification, a snapshot
 * of the connection's state, optional stack-specific data, and optionally
 * the TCP header and options of the associated packet. The TLB_FLAG_*
 * bits in tlb_eventflags say which optional parts are present.
 *
 * The structure is 8-byte aligned (ALIGN_TCP_LOG) and ends in a
 * zero-length array, so optional verbose information can directly follow
 * the fixed part of the record.
 */
struct tcp_log_buffer
{
/* Event basics */
struct timeval tlb_tv; /* Timestamp of trace */
uint32_t tlb_ticks; /* Timestamp of trace */
uint32_t tlb_sn; /* Serial number */
uint8_t tlb_stackid; /* Stack ID */
uint8_t tlb_eventid; /* Event ID */
uint16_t tlb_eventflags; /* Flags for the record */
#define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */
#define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */
#define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */
#define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */
#define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */
int tlb_errno; /* Event error (if any) */
/* Internal session state */
struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */
struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */
int tlb_state; /* TCPCB t_state */
uint32_t tlb_starttime; /* TCPCB t_starttime */
uint32_t tlb_iss; /* TCPCB iss */
uint32_t tlb_flags; /* TCPCB flags */
uint32_t tlb_snd_una; /* TCPCB snd_una */
uint32_t tlb_snd_max; /* TCPCB snd_max */
uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */
uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */
uint32_t tlb_snd_recover;/* TCPCB snd_recover */
uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */
uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */
uint32_t tlb_srtt; /* TCPCB t_srtt */
uint32_t tlb_rttvar; /* TCPCB t_rttvar */
uint32_t tlb_rcv_up; /* TCPCB rcv_up */
uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */
uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */
tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */
uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */
uint32_t tlb_dupacks; /* TCPCB t_dupacks */
int tlb_segqlen; /* TCPCB segqlen */
int tlb_snd_numholes; /* TCPCB snd_numholes */
uint32_t tlb_flex1; /* Event specific information */
uint32_t tlb_flex2; /* Event specific information */
uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */
tlb_rcv_scale:4; /* TCPCB rcv_scale */
uint8_t _pad[3]; /* Padding */
/* Per-stack info */
union tcp_log_stackspecific tlb_stackinfo;
/* Shorthand for the RACK member; no matching shorthand is defined here for u_bbr. */
#define tlb_rack tlb_stackinfo.u_rack
/* The packet */
uint32_t tlb_len; /* The packet's data length */
struct tcphdr tlb_th; /* The TCP header */
uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */
/* Verbose information (optional) */
struct tcp_log_verbose tlb_verbose[0];
} ALIGN_TCP_LOG;
/*
 * Event identifiers. The numeric values are spelled out because they are
 * recorded in the logs; new events go immediately before TCP_LOG_END,
 * whose value must then be bumped so that it stays one past the last
 * real event.
 */
enum tcp_log_events {
	TCP_LOG_IN		= 1,	/* Incoming packet */
	TCP_LOG_OUT		= 2,	/* Transmit (without other event) */
	TCP_LOG_RTO		= 3,	/* Retransmit timeout */
	TCP_LOG_TF_ACK		= 4,	/* Transmit due to TF_ACK */
	TCP_LOG_BAD_RETRAN	= 5,	/* Detected bad retransmission */
	TCP_LOG_PRR		= 6,	/* Doing PRR */
	TCP_LOG_REORDER		= 7,	/* Detected reorder */
	TCP_LOG_PACER		= 8,	/* Pacer sending a packet */
	BBR_LOG_BBRUPD		= 9,	/* We updated BBR info */
	BBR_LOG_BBRSND		= 10,	/* We did a slot calculation and sending is done */
	BBR_LOG_ACKCLEAR	= 11,	/* An ack clears all outstanding */
	BBR_LOG_INQUEUE		= 12,	/* The tcb had a packet input to it */
	BBR_LOG_TIMERSTAR	= 13,	/* Start a timer */
	BBR_LOG_TIMERCANC	= 14,	/* Cancel a timer */
	BBR_LOG_ENTREC		= 15,	/* Entered recovery */
	BBR_LOG_EXITREC		= 16,	/* Exited recovery */
	BBR_LOG_CWND		= 17,	/* Cwnd change */
	BBR_LOG_BWSAMP		= 18,	/* LT B/W sample has been made */
	BBR_LOG_MSGSIZE		= 19,	/* We received an EMSGSIZE error */
	BBR_LOG_BBRRTT		= 20,	/* BBR RTT is updated */
	BBR_LOG_JUSTRET		= 21,	/* We just returned out of output */
	BBR_LOG_STATE		= 22,	/* A BBR state change occurred */
	BBR_LOG_PKT_EPOCH	= 23,	/* A BBR packet epoch occurred */
	BBR_LOG_PERSIST		= 24,	/* BBR changed to/from a persists */
	TCP_LOG_FLOWEND		= 25,	/* End of a flow */
	BBR_LOG_RTO		= 26,	/* BBR's timeout includes BBR info */
	BBR_LOG_DOSEG_DONE	= 27,	/* pacer do_segment completes */
	/*
	 * NOTE(review): the description of BBR_LOG_EXIT_GAIN was a verbatim
	 * copy of the previous one; the name suggests leaving a gain state,
	 * but confirm against the BBR stack.
	 */
	BBR_LOG_EXIT_GAIN	= 28,	/* pacer do_segment completes */
	BBR_LOG_THRESH_CALC	= 29,	/* Doing threshold calculation */
	BBR_LOG_EXTRACWNDGAIN	= 30,	/* Removed */
	TCP_LOG_USERSEND	= 31,	/* User level sends data */
	UNUSED_32		= 32,	/* Unused */
	UNUSED_33		= 33,	/* Unused */
	BBR_LOG_TIME_EPOCH	= 34,	/* A time-based epoch occurred */
	BBR_LOG_TO_PROCESS	= 35,	/* A "to" (presumably timeout) was processed */
	BBR_LOG_BBRTSO		= 36,	/* TSO update */
	BBR_LOG_PACERDIAG	= 37,	/* Pacer diag insert */
	BBR_LOG_LOWGAIN		= 38,	/* Low gain accounting */
	BBR_LOG_PROGRESS	= 39,	/* Progress timer event */
	TCP_LOG_SOCKET_OPT	= 40,	/* A socket option is set */
	BBR_LOG_TIMERPREP	= 41,	/* A BBR var to debug out TLP issues */
	BBR_LOG_ENOBUF_JMP	= 42,	/* We had an enobuf jump */
	BBR_LOG_PACING_CALC	= 43,	/* calc the pacing time */
	BBR_LOG_RTT_SHRINKS	= 44,	/* We had a log reduction of rttProp */
	BBR_LOG_BW_RED_EV	= 45,	/* B/W reduction events */
	BBR_LOG_REDUCE		= 46,	/* old bbr log reduce for 4.1 and earlier */
	TCP_LOG_RTT		= 47,	/* An rtt (in useconds) is being sampled and applied to the srtt algo */
	BBR_LOG_SETTINGS_CHG	= 48,	/* Settings changed for loss response */
	TCP_LOG_END		= 49	/* End (keep at end) */
};
/*
 * Logging modes for a connection. TCP_LOG_STATE_OFF is the value the
 * TCP_LOG_EVENT macros test to decide whether to record anything.
 */
enum tcp_log_states {
	/* Deactivate and clear tracing */
	TCP_LOG_STATE_CLEAR	= -1,
	/* Pause */
	TCP_LOG_STATE_OFF	= 0,
	/* Keep the trailing events */
	TCP_LOG_STATE_TAIL	= 1,
	/* Keep the leading events */
	TCP_LOG_STATE_HEAD	= 2,
	/* Keep the leading events, and automatically dump them to the device */
	TCP_LOG_STATE_HEAD_AUTO	= 3,
	/* Continually dump the data when full */
	TCP_LOG_STATE_CONTINUAL	= 4,
	/*
	 * Keep the trailing events, and automatically dump them when the
	 * session ends
	 */
	TCP_LOG_STATE_TAIL_AUTO	= 5,
};
/*
 * Use this if we don't know whether the operation succeeded.
 * Presumably intended as the errornum argument of the TCP_LOG_EVENT
 * macros -- confirm against callers.
 */
#define ERRNO_UNK (-1)
/*
 * If the user included dev/tcp_log/tcp_log_dev.h, then include our private
 * headers. Otherwise, there is no reason to pollute all the files with an
 * additional include.
 *
 * Detection relies on that header's include guard, so it must be included
 * before this file for the definitions below to be visible.
 *
 * This structure is aligned to an 8-byte boundary to match the alignment
 * requirements of (struct tcp_log_buffer).
 */
#ifdef __tcp_log_dev_h__
/*
 * Header for a block of log records handed to the log device. It starts
 * with the device's common header, as every block must; the tlh_version,
 * tlh_type and tlh_length macros are shorthands for the common fields.
 */
struct tcp_log_header {
struct tcp_log_common_header tlh_common;
#define tlh_version tlh_common.tlch_version
#define tlh_type tlh_common.tlch_type
#define tlh_length tlh_common.tlch_length
struct in_endpoints tlh_ie;
struct timeval tlh_offset; /* Uptime -> UTC offset */
char tlh_id[TCP_LOG_ID_LEN];
char tlh_reason[TCP_LOG_REASON_LEN];
uint8_t tlh_af;
uint8_t _pad[7];
} ALIGN_TCP_LOG;
#ifdef _KERNEL
/*
 * Device queue entry for a batch of log records. tldl_common must stay the
 * first member so the entry can be treated as a struct tcp_log_dev_queue.
 * NOTE(review): struct tcp_log_stailq is declared elsewhere; tldl_count is
 * presumably the number of records in tldl_entries -- confirm.
 */
struct tcp_log_dev_log_queue {
struct tcp_log_dev_queue tldl_common;
char tldl_id[TCP_LOG_ID_LEN];
char tldl_reason[TCP_LOG_REASON_LEN];
struct in_endpoints tldl_ie;
struct tcp_log_stailq tldl_entries;
int tldl_count;
uint8_t tldl_af;
};
#endif /* _KERNEL */
#endif /* __tcp_log_dev_h__ */
#ifdef _KERNEL
/*
 * Default log buffer limits.
 * NOTE(review): presumably the maximum number of log entries per session
 * and system-wide, respectively -- confirm against the implementation.
 */
#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
/*
 * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
 * tries to record verbose information: the caller of tcp_output()
 * (tp->t_output_caller) and the function name and line number of the
 * call site.  It also takes an additional argument, tv, which is passed
 * through to tcp_log_event_().
 *
 * Nothing is recorded while tp->t_logstate is TCP_LOG_STATE_OFF.
 *
 * tp is parenthesized wherever it is dereferenced so that any pointer
 * expression (e.g. "*tpp" or a cast) can be passed safely.  It is
 * evaluated more than once, so it must not have side effects.
 */
#define	TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
	do {								\
		if ((tp)->t_logstate != TCP_LOG_STATE_OFF)		\
			tcp_log_event_((tp), th, rxbuf, txbuf, eventid,	\
			    errornum, len, stackinfo, th_hostorder,	\
			    (tp)->t_output_caller, __func__, __LINE__,	\
			    tv);					\
	} while (0)
/*
 * TCP_LOG_EVENT: This is a macro so we can capture function/line
 * information when needed.
 *
 * Prototype:
 * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
 *     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
 *     union tcp_log_stackspecific *stackinfo, int th_hostorder)
 *
 * tp is mandatory and must be write locked.
 * th is optional; if present, it will appear in the record.
 * rxbuf and txbuf are optional; if present, they will appear in the record.
 * eventid is mandatory.
 * errornum is mandatory (it indicates the success or failure of the
 *     operation associated with the event).
 * len indicates the length of the packet. If no packet, use 0.
 * stackinfo is optional; if present, it will appear in the record.
 * th_hostorder is passed through to tcp_log_event_() (presumably it tells
 *     whether th is already in host byte order -- confirm).
 *
 * When tcp_log_verbose is set (or TCP_LOG_FORCEVERBOSE is defined), the
 * event is recorded through TCP_LOG_EVENT_VERBOSE with a NULL tv.
 * Otherwise it is recorded without caller/function/line information, and
 * only if tp->t_logstate is not TCP_LOG_STATE_OFF.
 *
 * tp is evaluated more than once, so it must not have side effects.
 */
#ifdef TCP_LOG_FORCEVERBOSE
/*
 * TCP_LOG_EVENT_VERBOSE takes one more argument (tv) than TCP_LOG_EVENT,
 * so it cannot simply be aliased; supply a NULL tv explicitly.
 */
#define	TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
	TCP_LOG_EVENT_VERBOSE((tp), th, rxbuf, txbuf, eventid,		\
	    errornum, len, stackinfo, th_hostorder, NULL)
#else
#define	TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
	do {								\
		if (tcp_log_verbose)					\
			TCP_LOG_EVENT_VERBOSE((tp), th, rxbuf, txbuf,	\
			    eventid, errornum, len, stackinfo,		\
			    th_hostorder, NULL);			\
		else if ((tp)->t_logstate != TCP_LOG_STATE_OFF)		\
			tcp_log_event_((tp), th, rxbuf, txbuf, eventid,	\
			    errornum, len, stackinfo, th_hostorder,	\
			    NULL, NULL, 0, NULL);			\
	} while (0)
#endif /* TCP_LOG_FORCEVERBOSE */
/*
 * TCP_LOG_EVENTP: Like the non-verbose form of TCP_LOG_EVENT, except the
 * caller supplies tv, which is passed through to tcp_log_event_().  No
 * caller/function/line information is recorded, and tcp_log_verbose is
 * not consulted.
 *
 * Nothing is recorded while tp->t_logstate is TCP_LOG_STATE_OFF.
 *
 * tp is parenthesized where it is dereferenced so any pointer expression
 * can be passed; it is evaluated more than once, so it must not have side
 * effects.
 */
#define	TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
	do {								\
		if ((tp)->t_logstate != TCP_LOG_STATE_OFF)		\
			tcp_log_event_((tp), th, rxbuf, txbuf, eventid,	\
			    errornum, len, stackinfo, th_hostorder,	\
			    NULL, NULL, 0, tv);				\
	} while (0)
/* When true, TCP_LOG_EVENT records events via TCP_LOG_EVENT_VERBOSE. */
extern bool tcp_log_verbose;
/* Called for each connection from tcp_drain(). */
void tcp_log_drain(struct tcpcb *tp);
/*
 * Dump the log buffer of one connection, tagged with the given reason.
 * The TCP_LOGDUMP socket option handler releases the INP lock itself after
 * this call.
 */
int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force);
/*
 * Dump the log buffers associated with the connection's log ID.
 * Per its caller, this function drops the INP lock.
 */
void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason);
/*
 * Back end of the TCP_LOG_EVENT* macros; call it through those macros so
 * that the log state check and function/line capture are applied.
 */
struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
union tcp_log_stackspecific *stackinfo, int th_hostorder,
const char *output_caller, const char *func, int line, const struct timeval *tv);
/*
 * Copy the connection's log ID into buf.  Callers pass a buffer of
 * TCP_LOG_ID_LEN bytes and treat the return value as the string length
 * (excluding the terminating NUL).
 */
size_t tcp_log_get_id(struct tcpcb *tp, char *buf);
u_int tcp_log_get_id_cnt(struct tcpcb *tp);
/* Handler for reading TCP_LOGBUF; per its caller, it does INP_WUNLOCK(). */
int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp);
/* One-time initialization, called from tcp_init(). */
void tcp_log_init(void);
/* Set the connection's log ID; per its caller, it unlocks the INP. */
int tcp_log_set_id(struct tcpcb *tp, char *id);
/* Change the logging mode; state is one of enum tcp_log_states. */
int tcp_log_state_change(struct tcpcb *tp, int state);
/* Per-tcpcb setup, called from tcp_newtcpcb(). */
void tcp_log_tcpcbinit(struct tcpcb *tp);
/* Per-tcpcb teardown, called just before the tcpcb is freed. */
void tcp_log_tcpcbfini(struct tcpcb *tp);
void tcp_log_flowend(struct tcpcb *tp);
#endif /* _KERNEL */
#endif /* __tcp_log_buf_h__ */

View file

@ -74,6 +74,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@ -1310,6 +1311,10 @@ tcp_output(struct tcpcb *tp)
}
#endif
/* We're getting ready to send; log now. */
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
len, NULL, false);
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
@ -1549,6 +1554,9 @@ tcp_output(struct tcpcb *tp)
}
if (error) {
/* Record the error. */
TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT,
error, 0, NULL, false);
/*
* We know that the packet was lost, so back out the

View file

@ -98,6 +98,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/cc/cc.h>
#ifdef INET6
@ -425,6 +426,71 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
NULL, 0, sysctl_net_inet_list_available, "A",
"list available TCP Function sets");
/*
* Exports one (struct tcp_function_id) for each non-alias.
*/
static int
sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS)
{
int error, cnt;
struct tcp_function *f;
struct tcp_function_id tfi;
/*
* We don't allow writes.
*/
if (req->newptr != NULL)
return (EINVAL);
/*
* Wire the old buffer so we can directly copy the functions to
* user space without dropping the lock.
*/
if (req->oldptr != NULL) {
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
}
/*
* Walk the list, comparing the name of the function entry and
* function block to determine which is an alias.
* If exporting the list, copy out matching entries. Otherwise,
* just record the total length.
*/
cnt = 0;
rw_rlock(&tcp_function_lock);
TAILQ_FOREACH(f, &t_functions, tf_next) {
if (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name,
TCP_FUNCTION_NAME_LEN_MAX))
continue;
if (req->oldptr != NULL) {
tfi.tfi_id = f->tf_fb->tfb_id;
(void)strncpy(tfi.tfi_name, f->tf_name,
TCP_FUNCTION_NAME_LEN_MAX);
tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
/*
* Don't stop on error, as that is the
* mechanism we use to accumulate length
* information if the buffer was too short.
*/
} else
cnt++;
}
rw_runlock(&tcp_function_lock);
if (req->oldptr == NULL)
error = SYSCTL_OUT(req, NULL,
(cnt + 1) * sizeof(struct tcp_function_id));
return (error);
}
/*
 * Register the read-only, opaque "function_ids" node under _net_inet_tcp;
 * reads are served by sysctl_net_inet_list_func_ids() as an array of
 * (struct tcp_function_id).
 */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids,
CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id",
"List TCP function block name-to-ID mappings");
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
@ -504,6 +570,8 @@ maketcp_hashsize(int size)
return (hashsize);
}
/*
 * Next ID to assign to a TCP function block when it is registered; it is
 * advanced with atomic_fetchadd_int().
 * NOTE(review): the value is stored in tfb_id, which is a uint8_t, so IDs
 * would be truncated after 255 registrations -- confirm this is acceptable.
 */
static volatile int next_tcp_stack_id = 1;
/*
* Register a TCP function block with the name provided in the names
* array. (Note that this function does NOT automatically register
@ -563,6 +631,7 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
refcount_init(&blk->tfb_refcnt, 0);
blk->tfb_flags = 0;
blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
for (i = 0; i < *num_names; i++) {
n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
if (n == NULL) {
@ -779,6 +848,8 @@ tcp_init(void)
/* Setup the tcp function block list */
init_tcp_functions();
register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
/* Initialize the TCP logging data. */
tcp_log_init();
if (tcp_soreceive_stream) {
#ifdef INET
@ -1360,6 +1431,8 @@ tcp_newtcpcb(struct inpcb *inp)
*/
tcp_pcap_tcpcb_init(tp);
#endif
/* Initialize the per-TCPCB log data. */
tcp_log_tcpcbinit(tp);
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
@ -1577,6 +1650,7 @@ tcp_discardcb(struct tcpcb *tp)
inp->inp_ppcb = NULL;
if (tp->t_timers->tt_draincnt == 0) {
/* We own the last reference on tcpcb, let's free it. */
tcp_log_tcpcbfini(tp);
TCPSTATES_DEC(tp->t_state);
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
@ -1607,6 +1681,7 @@ tcp_timer_discard(void *ptp)
tp->t_timers->tt_draincnt--;
if (tp->t_timers->tt_draincnt == 0) {
/* We own the last reference on this tcpcb, let's free it. */
tcp_log_tcpcbfini(tp);
TCPSTATES_DEC(tp->t_state);
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
@ -1700,6 +1775,7 @@ tcp_drain(void)
if ((tcpb = intotcpcb(inpb)) != NULL) {
tcp_reass_flush(tcpb);
tcp_clean_sackreport(tcpb);
tcp_log_drain(tcpb);
#ifdef TCPPCAP
if (tcp_pcap_aggressive_free) {
/* Free the TCP PCAP queues. */
@ -2856,6 +2932,7 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
xt->t_state = TCPS_TIME_WAIT;
} else {
xt->t_state = tp->t_state;
xt->t_logstate = tp->t_logstate;
xt->t_flags = tp->t_flags;
xt->t_sndzerowin = tp->t_sndzerowin;
xt->t_sndrexmitpack = tp->t_sndrexmitpack;
@ -2879,6 +2956,8 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
TCP_FUNCTION_NAME_LEN_MAX);
bzero(xt->xt_logid, TCP_LOG_ID_LEN);
(void)tcp_log_get_id(tp, xt->xt_logid);
}
xt->xt_len = sizeof(struct xtcpcb);

View file

@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
@ -644,6 +645,7 @@ tcp_timer_rexmt(void * xtp)
KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
("%s: tp %p tcpcb can't be stopped here", __func__, tp));
tcp_free_sackholes(tp);
TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
if (tp->t_fb->tfb_tcp_rexmit_tmr) {
/* The stack has a timer action too. */
(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);

View file

@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
@ -1026,6 +1027,11 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->t_flags &= ~TF_FORCEDATA;
}
}
TCP_LOG_EVENT(tp, NULL,
&inp->inp_socket->so_rcv,
&inp->inp_socket->so_snd,
TCP_LOG_USERSEND, error,
0, NULL, false);
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
@ -1533,6 +1539,15 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
}
/*
 * tcp_default_ctloutput() uses a single on-stack buffer (buf), sized
 * TCP_LOG_ID_LEN, for strings bounded by TCP_CA_NAME_MAX, TCP_LOG_ID_LEN
 * and TCP_LOG_REASON_LEN.  If either assert becomes untrue, we need to
 * change the size of the buf variable in tcp_default_ctloutput().
 */
#ifdef CTASSERT
CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
#endif
int
tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
@ -1540,7 +1555,7 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
u_int ui;
struct tcp_info ti;
struct cc_algo *algo;
char *pbuf, buf[TCP_CA_NAME_MAX];
char *pbuf, buf[TCP_LOG_ID_LEN];
size_t len;
/*
@ -1822,6 +1837,55 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
goto unlock_and_done;
}
case TCP_LOG:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
return (error);
INP_WLOCK_RECHECK(inp);
error = tcp_log_state_change(tp, optval);
goto unlock_and_done;
case TCP_LOGBUF:
INP_WUNLOCK(inp);
error = EINVAL;
break;
case TCP_LOGID:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
if (error)
break;
buf[sopt->sopt_valsize] = '\0';
INP_WLOCK_RECHECK(inp);
error = tcp_log_set_id(tp, buf);
/* tcp_log_set_id() unlocks the INP. */
break;
case TCP_LOGDUMP:
case TCP_LOGDUMPID:
INP_WUNLOCK(inp);
error =
sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
if (error)
break;
buf[sopt->sopt_valsize] = '\0';
INP_WLOCK_RECHECK(inp);
if (sopt->sopt_name == TCP_LOGDUMP) {
error = tcp_log_dump_tp_logbuf(tp, buf,
M_WAITOK, true);
INP_WUNLOCK(inp);
} else {
tcp_log_dump_tp_bucket_logbufs(tp, buf);
/*
* tcp_log_dump_tp_bucket_logbufs() drops the
* INP lock.
*/
}
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@ -1907,6 +1971,25 @@ tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
case TCP_LOG:
optval = tp->t_logstate;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(optval));
break;
case TCP_LOGBUF:
/* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
error = tcp_log_getlogbuf(sopt, tp);
break;
case TCP_LOGID:
len = tcp_log_get_id(tp, buf);
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, buf, len + 1);
break;
case TCP_LOGDUMP:
case TCP_LOGDUMPID:
INP_WUNLOCK(inp);
error = EINVAL;
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;

View file

@ -79,6 +79,8 @@ struct sackhint {
uint64_t _pad[1]; /* TBD */
};
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
/*
* Tcp control block, one per tcp; fields:
* Organized for 16 byte cacheline efficiency.
@ -189,6 +191,13 @@ struct tcpcb {
u_int t_tsomaxsegcount; /* TSO maximum segment count */
u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
u_int t_flags2; /* More tcpcb flags storage */
int t_logstate; /* State of "black box" logging */
struct tcp_log_stailq t_logs; /* Log buffer */
int t_lognum; /* Number of log entries */
uint32_t t_logsn; /* Log "serial number" */
struct tcp_log_id_node *t_lin;
struct tcp_log_id_bucket *t_lib;
const char *t_output_caller; /* Function that called tcp_output */
struct tcp_function_block *t_fb;/* TCP function call block */
void *t_fb_ptr; /* Pointer to t_fb specific data */
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
@ -267,6 +276,7 @@ struct tcp_function_block {
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
};
struct tcp_function {
@ -339,11 +349,12 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
#define TCPOOB_HADDATA 0x02
/*
* Flags for PLPMTU handling, t_flags2
* Flags for the extended TCP flags field, t_flags2
*/
#define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */
#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
#define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */
/*
* Structure to hold TCP options that are only used during segment
@ -654,6 +665,7 @@ struct xtcpcb {
size_t xt_len; /* length of this structure */
struct xinpcb xt_inp;
char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */
char xt_logid[TCP_LOG_ID_LEN]; /* (s) */
int64_t spare64[8];
int32_t t_state; /* (s,p) */
uint32_t t_flags; /* (s,p) */
@ -666,13 +678,23 @@ struct xtcpcb {
int32_t tt_keep; /* (s) */
int32_t tt_2msl; /* (s) */
int32_t tt_delack; /* (s) */
int32_t t_logstate; /* (3) */
int32_t spare32[32];
} __aligned(8);
#ifdef _KERNEL
void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
#endif
#endif
/*
 * TCP function name-to-id mapping exported to user-land via sysctl(3).
 * One of these is produced per non-alias TCP function entry.
 */
struct tcp_function_id {
/* ID of the function block (copied from tfb_id). */
uint8_t tfi_id;
/* NUL-terminated name of the function block. */
char tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
};
/*
* Identifiers for TCP sysctl nodes
*/

View file

@ -321,7 +321,7 @@ protopr(u_long off, const char *name, int af1, int proto)
"Proto", "Recv-Q", "Send-Q",
"Local Address", "Foreign Address");
if (!xflag && !Rflag)
xo_emit(" (state)");
xo_emit(" {T:/%-11.11s}", "(state)");
}
if (xflag) {
xo_emit(" {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} "
@ -339,6 +339,8 @@ protopr(u_long off, const char *name, int af1, int proto)
xo_emit(" {T:/%8.8s} {T:/%5.5s}",
"flowid", "ftype");
}
if (Pflag)
xo_emit(" {T:/%s}", "Log ID");
xo_emit("\n");
first = 0;
}
@ -478,9 +480,9 @@ protopr(u_long off, const char *name, int af1, int proto)
}
if (istcp && !Lflag && !xflag && !Tflag && !Rflag) {
if (tp->t_state < 0 || tp->t_state >= TCP_NSTATES)
xo_emit("{:tcp-state/%d}", tp->t_state);
xo_emit("{:tcp-state/%-11d}", tp->t_state);
else {
xo_emit("{:tcp-state/%s}",
xo_emit("{:tcp-state/%-11s}",
tcpstates[tp->t_state]);
#if defined(TF_NEEDSYN) && defined(TF_NEEDFIN)
/* Show T/TCP `hidden state' */
@ -495,6 +497,9 @@ protopr(u_long off, const char *name, int af1, int proto)
inp->inp_flowid,
inp->inp_flowtype);
}
if (istcp && Pflag)
xo_emit(" {:log-id/%s}", tp->xt_logid[0] == '\0' ?
"-" : tp->xt_logid);
xo_emit("\n");
xo_close_instance("socket");
}

View file

@ -214,6 +214,7 @@ int mflag; /* show memory stats */
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
static int Qflag; /* show netisr information */
int rflag; /* show routing tables (or routing stats) */
@ -247,7 +248,7 @@ main(int argc, char *argv[])
if (argc < 0)
exit(EXIT_FAILURE);
while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:np:Qq:RrSTsuWw:xz"))
while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@ -344,6 +345,9 @@ main(int argc, char *argv[])
case 'n':
numeric_addr = numeric_port = 1;
break;
case 'P':
Pflag = 1;
break;
case 'p':
if ((tp = name2protox(optarg)) == NULL) {
xo_errx(1, "%s: unknown or uninstrumented "

View file

@ -39,7 +39,7 @@
.Bl -tag -width "netstat"
.It Nm
.Op Fl -libxo
.Op Fl 46AaLnRSTWx
.Op Fl 46AaLnPRSTWx
.Op Fl f Ar protocol_family | Fl p Ar protocol
.Op Fl M Ar core
.Op Fl N Ar system
@ -181,6 +181,8 @@ and the third count is the maximum number of queued connections.
Do not resolve numeric addresses and port numbers to names.
See
.Sx GENERAL OPTIONS .
.It Fl P
Display the TCP log ID for each TCP socket.
.It Fl R
Display the flowid and flowtype for each socket.
flowid is a 32 bit hardware specific identifier for each flow.

View file

@ -50,6 +50,7 @@ extern int mflag; /* show memory stats */
extern int noutputs; /* how much outputs before we exit */
extern int numeric_addr; /* show addresses numerically */
extern int numeric_port; /* show ports numerically */
extern int Pflag; /* show TCP log ID */
extern int rflag; /* show routing tables (or routing stats) */
extern int Rflag; /* show flowid / RSS information */
extern int sflag; /* show protocol statistics */