add in-kernel ttcp performance tool

svn path=/head/; revision=122592
2024-10-18 14:23:55 +00:00 · 2003-11-13 00:30:27 +00:00 · 2003-11-13 00:30:27 +00:00 · 891ea87e6a · 2020-12-20 02:59:44 +00:00
parent 30093b05a0
commit 891ea87e6a
7 changed files with 1196 additions and 0 deletions
--- a/tools/tools/README
+++ b/tools/tools/README
@ -28,6 +28,7 @@ kdrv		KernelDriver; add/list/remove third-party kernel driver
 kerncruft	Shellscript to find orphaned *.c files in /sys
 kerninclude	Shellscript to find unused #includes in the kernel.
 kernxref	Shellscript to cross reference symbols in the LINT kernel.
+kttcp		An in-kernel version of the ttcp network performance tool
 mid	 	Create a Message-ID database for mailing lists.
 pciid		Generate src/share/misc/pci_vendors.
 portsinfo 	Generate list of new ports for last two weeks.
--- a/tools/tools/kttcp/Makefile
+++ b/tools/tools/kttcp/Makefile
@ -0,0 +1,23 @@
+#	$FreeBSD$
+
+SHELL=	/bin/sh
+
+PROG=	kttcp
+SRCS=	kttcp.c
+BINDIR=	/usr/local/bin
+SYSDIR=	/usr/src/sys
+
+CFLAGS	+= -I${SYSDIR} -Isys
+
+all:	kttcp module
+
+module:
+	cd sys; SYSDIR=${SYSDIR} make
+
+install:
+	install kttcp ${DESTDIR}/${BINDIR}
+	cd sys; SYSDIR=${SYSDIR} make install
+
+clean:
+	rm -f ${PROG}
+	cd sys; SYSDIR=${SYSDIR} make clean
--- a/tools/tools/kttcp/README
+++ b/tools/tools/kttcp/README
@ -0,0 +1,24 @@
+$FreeBSD$
+
+This is a port of Jason Thorpe's kttcp tool for testing network
+performance for in-kernel applications (like NFS).  The tool consists
+of a loadable module and a small user-mode application.  Beware
+that you should match the kernel module to the kernel it is to be
+used with.  By default SYSDIR is set to /usr/src/sys in Makefile.
+You may want to change that.
+
+To use the tool do something like on each of two machines:
+
+1. make
+2. su; make install (installs module and kttcp in /usr/local/bin)
+3. kldload kttcp
+
+Then:
+
+4. kttcp -r on one machine
+5. kttcp -t foo on the other machine, where foo is the
+   machine where #4 was done.
+
+kttcp w/o arguments gives usage.  Otherwise the source is your
+friend.  Beware that the kernel code must mimic soreceive and sosend
+for results to be meaningful.
--- a/tools/tools/kttcp/kttcp.c
+++ b/tools/tools/kttcp/kttcp.c
@ -0,0 +1,309 @@
+/*	$FreeBSD$	*/
+/*	$NetBSD: kttcp.c,v 1.5 2002/07/11 23:32:35 simonb Exp $	*/
+
+/*
+ * Copyright (c) 2002 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Frank van der Linden and Jason R. Thorpe
+ * for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed for the NetBSD Project by
+ *	Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+
+#include "dev/kttcp/kttcpio.h"
+
+#define	KTTCP_PORT		"22222"
+#define	KTTCP_XMITSIZE		(10*1024*1024)
+#define	KTTCP_SOCKBUF_DEFAULT	65536
+
+#define	KTTCP_DEVICE		"/dev/kttcp"
+
+static void
+usage(void)
+{
+	fprintf(stderr,
+	    "usage: kttcp -r [-b sockbufsize] [-p port] [-q] [-v]\n"
+	    "                [-4] [-6]\n"
+	    "       kttcp -t [-b sockbufsize] [-n bytes] [-q] [-v] [-p port]\n"
+	    "                [-4] [-6] host\n"
+	);
+	exit(1);
+}
+
+static unsigned long long
+get_bytes(const char *str)
+{
+	unsigned long long bytes;
+	char *cp;
+
+	bytes = strtoull(str, &cp, 10);
+	if (bytes == ULLONG_MAX && errno == ERANGE)
+		err(1, "%s", str);
+
+	if (cp[0] != '\0') {
+		if (cp[1] != '\0')
+			errx(1, "invalid byte count: %s", str);
+		if (cp[0] == 'k' || cp[0] == 'K')
+			bytes *= 1024;
+		else if (cp[0] == 'm' || cp[0] == 'M')
+			bytes *= 1024 * 1024;
+		else if (cp[0] == 'g' || cp[0] == 'G')
+			bytes *= 1024 * 1024 * 1024;
+		else
+			errx(1, "invalid byte count modifier: %s", str);
+	}
+
+	return (bytes);
+}
+
+int
+main(int argc, char *argv[])
+{
+	int c, error, s, verbose, s2, kfd;
+	int xmitset, family;
+	int bufsize;
+	int ai_flag;
+	char *host;
+	char *portstr;
+	struct kttcp_io_args kio;
+	struct addrinfo hints, *addr, *res;
+	struct sockaddr_storage ss;
+	struct rusage rustart, ruend;
+	struct timeval tvtmp;
+	unsigned long long ull, usecs, bytespersec, bitspersec, xmitsize;
+	char connecthost[NI_MAXHOST];
+	socklen_t slen;
+	const int one = 1;
+	u_long cmd;
+
+	cmd = 0;
+	portstr = KTTCP_PORT;
+	verbose = 1;
+	xmitset = 0;
+	bufsize = KTTCP_SOCKBUF_DEFAULT;
+	xmitsize = KTTCP_XMITSIZE;
+	family = PF_UNSPEC;
+	while ((c = getopt(argc, argv, "46b:n:p:qrtvw:")) != -1) {
+		switch (c) {
+		case '4':
+			if (family != PF_UNSPEC)
+				usage();
+			family = PF_INET;
+			break;
+		case '6':
+			if (family != PF_UNSPEC)
+				usage();
+			family = PF_INET6;
+			break;
+		case 'b':
+			ull = get_bytes(optarg);
+			if (ull > INT_MAX)
+				errx(1,
+				    "invalid socket buffer size: %s\n", optarg);
+			bufsize = ull;
+			break;
+		case 'n':
+			xmitsize = get_bytes(optarg);
+			if (xmitsize > KTTCP_MAX_XMIT)
+				xmitsize = KTTCP_MAX_XMIT;
+			xmitset = 1;
+			break;
+		case 'p':
+			portstr = optarg;
+			break;
+		case 'q':
+			verbose = 0;
+			break;
+		case 'r':
+			if (cmd != 0)
+				usage();
+			cmd = KTTCP_IO_RECV;
+			break;
+		case 't':
+			if (cmd != 0)
+				usage();
+			cmd = KTTCP_IO_SEND;
+			break;
+		case 'v':
+			verbose = 2;
+			break;
+		case '?':
+		default:
+			usage();
+		}
+	}
+	if (cmd == 0)
+		usage();
+
+	argc -= optind;
+	argv += optind;
+
+	if (cmd == KTTCP_IO_SEND) {
+		if (xmitsize <= 0 || argc < 1)
+			usage();
+		host = argv[0];
+		ai_flag = 0;
+	} else {
+		if (xmitset == 0)
+			xmitsize = KTTCP_MAX_XMIT;
+		host = NULL;
+		ai_flag = AI_PASSIVE;
+	}
+
+	if ((kfd = open(KTTCP_DEVICE, O_RDWR, 666)) == -1)
+		err(2, "open %s", KTTCP_DEVICE);
+
+	memset(&hints, 0, sizeof hints);
+	hints.ai_flags = ai_flag;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_family = family;
+	error = getaddrinfo(host, portstr, &hints, &addr);
+
+	if (error != 0)
+		errx(2, "%s", gai_strerror(error));
+
+	s = -1;
+	for (res = addr; res != NULL; res = res->ai_next) {
+		s = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+		if (s >= 0)
+			break;
+	}
+	if (res == NULL)
+		err(2, "can't create socket");
+
+	printf("kttcp: socket buffer size: %d\n", bufsize);
+
+	if (cmd == KTTCP_IO_SEND) {
+		if (connect(s, res->ai_addr, res->ai_addrlen) < 0)
+			err(2, "connect");
+		if (verbose) {
+			getnameinfo(res->ai_addr, res->ai_addrlen,
+			    connecthost, sizeof connecthost, NULL, 0,
+			    NI_NUMERICHOST);
+			printf("kttcp: connected to %s\n", connecthost);
+		}
+		if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof (int))
+		    < 0)
+			err(2, "setsockopt sndbuf");
+		kio.kio_socket = s;
+	} else {
+		if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one,
+		    sizeof (int)) < 0)
+			err(2, "setsockopt reuseaddr");
+		if (bind(s, res->ai_addr, res->ai_addrlen) < 0)
+			err(2, "bind");
+		if (listen(s, 1) < 0)
+			err(2, "listen");
+		if (verbose)
+			printf("kttcp: listening on port %s\n", portstr);
+		slen = sizeof ss;
+		s2 = accept(s, (struct sockaddr *)&ss, &slen);
+		if (s2 < 0)
+			err(2, "accept");
+		if (verbose) {
+			getnameinfo((struct sockaddr *)&ss, ss.ss_len,
+			    connecthost, sizeof connecthost, NULL, 0,
+			    NI_NUMERICHOST);
+			printf("kttcp: connect from %s\n", connecthost);
+		}
+		if (setsockopt(s2, SOL_SOCKET, SO_RCVBUF, &bufsize,
+		    sizeof (int)) < 0)
+			err(2, "setsockopt rcvbuf");
+		kio.kio_socket = s2;
+	}
+
+	kio.kio_totalsize = xmitsize;
+
+	getrusage(RUSAGE_SELF, &rustart);
+	if (ioctl(kfd, cmd, &kio) == -1)
+		err(2, "kttcp i/o command");
+	getrusage(RUSAGE_SELF, &ruend);
+
+	usecs = (unsigned long long)kio.kio_elapsed.tv_sec * 1000000;
+	usecs += kio.kio_elapsed.tv_usec;
+
+	bytespersec = kio.kio_bytesdone * 1000000LL / usecs;
+	bitspersec = bytespersec * NBBY;
+	printf("kttcp: %llu bytes in %ld.%03ld real seconds ==> %llu bytes/sec\n",
+	    kio.kio_bytesdone, kio.kio_elapsed.tv_sec,
+	    kio.kio_elapsed.tv_usec / 1000, bytespersec);
+	if (verbose > 1) {
+		timersub(&ruend.ru_stime, &rustart.ru_stime, &tvtmp);
+		bytespersec = kio.kio_bytesdone * 1000000LL /
+		    (tvtmp.tv_sec * 1000000ULL + tvtmp.tv_usec);
+		printf("kttcp: %llu bytes in %ld.%03ld CPU seconds ==> %llu bytes/CPU sec\n",
+		    kio.kio_bytesdone, tvtmp.tv_sec, tvtmp.tv_usec / 1000, bytespersec);
+	}
+	printf("       %g (%g) Megabits/sec\n",
+	    ((double) bitspersec / 1024.0) / 1024.0,
+	    ((double) bitspersec / 1000.0) / 1000.0);
+
+	timersub(&ruend.ru_utime, &rustart.ru_utime, &tvtmp);
+	/* XXX
+	 * sometimes, this ends up as -1 * hz!?
+	 */
+	if (tvtmp.tv_sec < 0)
+		tvtmp.tv_sec = tvtmp.tv_usec = 0;
+	printf("  %ld.%02lduser", tvtmp.tv_sec, tvtmp.tv_usec / 10000);
+	ull = tvtmp.tv_sec * 1000000ULL + tvtmp.tv_usec;
+
+	timersub(&ruend.ru_stime, &rustart.ru_stime, &tvtmp);
+	printf(" %ld.%02ldsys", tvtmp.tv_sec, tvtmp.tv_usec / 10000);
+	ull += tvtmp.tv_sec * 1000000ULL + tvtmp.tv_usec;
+
+	printf(" %lld.%lldreal", usecs / 1000000, (usecs % 1000000) / 10000);
+	printf(" %lld%%", ull * 100 / usecs);
+	printf("\n");
+
+
+	close(kio.kio_socket);
+	if (cmd == KTTCP_IO_RECV)
+		close(s);
+	close(kfd);
+	freeaddrinfo(addr);
+
+	return 0;
+}
--- a/tools/tools/kttcp/sys/Makefile
+++ b/tools/tools/kttcp/sys/Makefile
@ -0,0 +1,8 @@
+# $FreeBSD$
+
+KMOD	= kttcp
+SRCS	= kttcp.c
+SRCS   += device_if.h
+MFILES	= kern/device_if.m
+
+.include <bsd.kmod.mk>
--- a/tools/tools/kttcp/sys/kttcp.c
+++ b/tools/tools/kttcp/sys/kttcp.c
@ -0,0 +1,772 @@
+/*	$FreeBSD$	*/
+/*	$NetBSD: kttcp.c,v 1.3 2002/07/03 19:36:52 thorpej Exp $	*/
+
+/*
+ * Copyright (c) 2002 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Frank van der Linden and Jason R. Thorpe for
+ * Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed for the NetBSD Project by
+ *	Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * kttcp.c --
+ *
+ *	This module provides kernel support for testing network
+ *	throughput from the perspective of the kernel.  It is
+ *	similar in spirit to the classic ttcp network benchmark
+ *	program, the main difference being that with kttcp, the
+ *	kernel is the source and sink of the data.
+ *
+ *	Testing like this is useful for a few reasons:
+ *
+ *	1. This allows us to know what kind of performance we can
+ *	   expect from network applications that run in the kernel
+ *	   space, such as the NFS server or the NFS client.  These
+ *	   applications don't have to move the data to/from userspace,
+ *	   and so benchmark programs which run in userspace don't
+ *	   give us an accurate model.
+ *
+ *	2. Since data received is just thrown away, the receiver
+ *	   is very fast.  This can provide better exercise for the
+ *	   sender at the other end.
+ *
+ *	3. Since the NetBSD kernel currently uses a run-to-completion
+ *	   scheduling model, kttcp provides a benchmark model where
+ *	   preemption of the benchmark program is not an issue.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/sysctl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/protosw.h>
+#include <sys/socketvar.h>
+#include <sys/socket.h>
+#include <sys/mbuf.h>
+#include <sys/resourcevar.h>
+#include <sys/proc.h>
+
+#include <dev/kttcp/kttcpio.h>
+
+#ifndef timersub
+#define timersub(tvp, uvp, vvp)						\
+	do {								\
+		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
+		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
+		if ((vvp)->tv_usec < 0) {				\
+			(vvp)->tv_sec--;				\
+			(vvp)->tv_usec += 1000000;			\
+		}							\
+	} while (0)
+#endif
+
+static int kttcp_send(struct thread *p, struct kttcp_io_args *);
+static int kttcp_recv(struct thread *p, struct kttcp_io_args *);
+static int kttcp_sosend(struct socket *, unsigned long long,
+			unsigned long long *, struct thread *, int);
+static int kttcp_soreceive(struct socket *, unsigned long long,
+			   unsigned long long *, struct thread *, int *);
+
+static d_open_t		kttcpopen;
+static d_ioctl_t	kttcpioctl;
+
+static struct cdevsw kttcp_cdevsw = {
+	.d_open =	kttcpopen,
+	.d_ioctl =	kttcpioctl,
+	.d_name =	"kttcp",
+	.d_maj =	MAJOR_AUTO,
+};
+
+static int
+kttcpopen(dev_t dev, int flag, int mode, struct thread *td)
+{
+	/* Always succeeds. */
+	return (0);
+}
+
+static int
+kttcpioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
+{
+	int error;
+
+	if ((flag & FWRITE) == 0)
+		return EPERM;
+
+	switch (cmd) {
+	case KTTCP_IO_SEND:
+		error = kttcp_send(td, (struct kttcp_io_args *) data);
+		break;
+
+	case KTTCP_IO_RECV:
+		error = kttcp_recv(td, (struct kttcp_io_args *) data);
+		break;
+
+	default:
+		return EINVAL;
+	}
+
+	return error;
+}
+
+static int
+kttcp_send(struct thread *td, struct kttcp_io_args *kio)
+{
+	struct file *fp;
+	int error;
+	struct timeval t0, t1;
+	unsigned long long len = 0;
+	unsigned long long done;
+
+	if (kio->kio_totalsize >= KTTCP_MAX_XMIT)
+		return EINVAL;
+
+	error = fget(td, kio->kio_socket, &fp);
+	if (error != 0)
+		return error;
+	mtx_lock(&Giant);
+	if ((fp->f_flag & FWRITE) == 0) {
+		fdrop(fp, td);
+		mtx_unlock(&Giant);
+		return EBADF;
+	}
+	if (fp->f_type == DTYPE_SOCKET) {
+		len = kio->kio_totalsize;
+		microtime(&t0);
+		do {
+			error = kttcp_sosend((struct socket *)fp->f_data, len,
+			    &done, td, 0);
+			len -= done;
+		} while (error == 0 && len > 0);
+		microtime(&t1);
+	} else
+		error = EFTYPE;
+	fdrop(fp, td);
+	mtx_unlock(&Giant);
+	if (error != 0)
+		return error;
+	timersub(&t1, &t0, &kio->kio_elapsed);
+
+	kio->kio_bytesdone = kio->kio_totalsize - len;
+
+	return 0;
+}
+
+static int
+kttcp_recv(struct thread *td, struct kttcp_io_args *kio)
+{
+	struct file *fp;
+	int error;
+	struct timeval t0, t1;
+	unsigned long long len = 0;
+	unsigned long long done;
+
+	if (kio->kio_totalsize > KTTCP_MAX_XMIT)
+		return EINVAL;
+
+	error = fget(td, kio->kio_socket, &fp);
+	if (error != 0)
+		return error;
+	mtx_lock(&Giant);
+	if ((fp->f_flag & FWRITE) == 0) {
+		fdrop(fp, td);
+		mtx_unlock(&Giant);
+		return EBADF;
+	}
+	if (fp->f_type == DTYPE_SOCKET) {
+		len = kio->kio_totalsize;
+		microtime(&t0);
+		do {
+			error = kttcp_soreceive((struct socket *)fp->f_data,
+			    len, &done, td, NULL);
+			len -= done;
+		} while (error == 0 && len > 0 && done > 0);
+		microtime(&t1);
+		if (error == EPIPE)
+			error = 0;
+	} else
+		error = EFTYPE;
+	fdrop(fp, td);
+	mtx_unlock(&Giant);
+	if (error != 0)
+		return error;
+	timersub(&t1, &t0, &kio->kio_elapsed);
+
+	kio->kio_bytesdone = kio->kio_totalsize - len;
+
+	return 0;
+}
+
+#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+
+/*
+ * Slightly changed version of sosend()
+ */
+int
+kttcp_sosend(struct socket *so, unsigned long long slen,
+	     unsigned long long *done, struct thread *td, int flags)
+{
+	struct mbuf **mp, *m, *top;
+	long space, len, mlen;
+	int error, s, dontroute, atomic;
+	long long resid;
+
+	atomic = sosendallatonce(so);
+	resid = slen;
+	top = NULL;
+	/*
+	 * In theory resid should be unsigned.
+	 * However, space must be signed, as it might be less than 0
+	 * if we over-committed, and we must use a signed comparison
+	 * of space and resid.  On the other hand, a negative resid
+	 * causes us to loop sending 0-length segments to the protocol.
+	 *
+	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+	 * type sockets since that's an error.
+	 */
+	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+		error = EINVAL;
+		goto out;
+	}
+
+	dontroute =
+	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
+	    (so->so_proto->pr_flags & PR_ATOMIC);
+	if (td)
+		td->td_proc->p_stats->p_ru.ru_msgsnd++;
+#define	snderr(errno)	{ error = errno; splx(s); goto release; }
+
+restart:
+	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+	if (error)
+		goto out;
+	do {
+		s = splnet();
+		if (so->so_state & SS_CANTSENDMORE)
+			snderr(EPIPE);
+		if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
+			splx(s);
+			goto release;
+		}
+		if ((so->so_state & SS_ISCONNECTED) == 0) {
+			/*
+			 * `sendto' and `sendmsg' is allowed on a connection-
+			 * based socket if it supports implied connect.
+			 * Return ENOTCONN if not connected and no address is
+			 * supplied.
+			 */
+			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+				    !(resid == 0))
+					snderr(ENOTCONN);
+			} else
+				snderr(EDESTADDRREQ);
+		}
+		space = sbspace(&so->so_snd);
+		if (flags & MSG_OOB)
+			space += 1024;
+		if (atomic && resid > so->so_snd.sb_hiwat)
+			snderr(EMSGSIZE);
+		if (space < resid && (atomic || space < so->so_snd.sb_lowat)) {
+			if (so->so_state & SS_NBIO)
+				snderr(EWOULDBLOCK);
+			sbunlock(&so->so_snd);
+			error = sbwait(&so->so_snd);
+			splx(s);
+			if (error)
+				goto out;
+			goto restart;
+		}
+		splx(s);
+		mp = &top;
+		do {
+			do {
+				if (top == 0) {
+					MGETHDR(m, M_WAIT, MT_DATA);
+					if (m == NULL) {
+						error = ENOBUFS;
+						goto release;
+					}
+					mlen = MHLEN;
+					m->m_pkthdr.len = 0;
+					m->m_pkthdr.rcvif = (struct ifnet *)0;
+				} else {
+					MGET(m, M_WAIT, MT_DATA);
+					if (m == NULL) {
+						error = ENOBUFS;
+						goto release;
+					}
+					mlen = MLEN;
+				}
+				if (resid >= MINCLSIZE) {
+					MCLGET(m, M_WAIT);
+					if ((m->m_flags & M_EXT) == 0)
+						goto nopages;
+					mlen = MCLBYTES;
+					len = min(min(mlen, resid), space);
+				} else {
+	nopages:
+					len = min(min(mlen, resid), space);
+					/*
+					 * For datagram protocols, leave room
+					 * for protocol headers in first mbuf.
+					 */
+					if (atomic && top == 0 && len < mlen)
+						MH_ALIGN(m, len);
+				}
+				space -= len;
+				resid -= len;
+				m->m_len = len;
+				*mp = m;
+				top->m_pkthdr.len += len;
+				if (error)
+					goto release;
+				mp = &m->m_next;
+				if (resid <= 0) {
+					if (flags & MSG_EOR)
+						top->m_flags |= M_EOR;
+					break;
+				}
+			} while (space > 0 && atomic);
+			if (dontroute)
+				so->so_options |= SO_DONTROUTE;
+			s = splnet();				/* XXX */
+			/*
+			 * XXX all the SS_CANTSENDMORE checks previously
+			 * done could be out of date.  We could have recieved
+			 * a reset packet in an interrupt or maybe we slept
+			 * while doing page faults in uiomove() etc. We could
+			 * probably recheck again inside the splnet() protection
+			 * here, but there are probably other places that this
+			 * also happens.  We must rethink this.
+			 */
+			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+			    (flags & MSG_OOB) ? PRUS_OOB :
+			    /*
+			     * If the user set MSG_EOF, the protocol
+			     * understands this flag and nothing left to
+			     * send then use PRU_SEND_EOF instead of PRU_SEND.
+			     */
+			    ((flags & MSG_EOF) &&
+			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+			     (resid <= 0)) ?
+				    PRUS_EOF :
+			    /* If there is more to send set PRUS_MORETOCOME */
+			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+			    top, NULL, NULL, td);
+			splx(s);
+			if (dontroute)
+				so->so_options &= ~SO_DONTROUTE;
+			top = 0;
+			mp = &top;
+			if (error)
+				goto release;
+		} while (resid && space > 0);
+	} while (resid);
+
+release:
+	sbunlock(&so->so_snd);
+out:
+	if (top)
+		m_freem(top);
+	*done = slen - resid;
+	return (error);
+}
+
+int
+kttcp_soreceive(struct socket *so, unsigned long long slen,
+		unsigned long long *done, struct thread *td, int *flagsp)
+{
+	struct mbuf *m, **mp;
+	int flags, len, error, s, offset;
+	struct protosw *pr;
+	struct mbuf *nextrecord;
+	int moff, type;
+	long long orig_resid, resid;
+
+	pr = so->so_proto;
+	mp = NULL;
+	type = 0;
+	resid = orig_resid = slen;
+	if (flagsp)
+		flags = *flagsp &~ MSG_EOR;
+	else
+		flags = 0;
+	if (flags & MSG_OOB) {
+		m = m_get(M_WAIT, MT_DATA);
+		if (m == NULL)
+			return (ENOBUFS);
+		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+		if (error)
+			goto bad;
+		do {
+			resid -= min(resid, m->m_len);
+			m = m_free(m);
+		} while (resid && error == 0 && m);
+bad:
+		if (m)
+			m_freem(m);
+		return (error);
+	}
+	if (mp)
+		*mp = (struct mbuf *)0;
+	if (so->so_state & SS_ISCONFIRMING && resid)
+		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
+
+restart:
+	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+	if (error)
+		return (error);
+	s = splnet();
+
+	m = so->so_rcv.sb_mb;
+	/*
+	 * If we have less data than requested, block awaiting more
+	 * (subject to any timeout) if:
+	 *   1. the current count is less than the low water mark, or
+	 *   2. MSG_WAITALL is set, and it is possible to do the entire
+	 *	receive operation at once if we block (resid <= hiwat).
+	 *   3. MSG_DONTWAIT is not set
+	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
+	 * we have to do the receive in sections, and thus risk returning
+	 * a short count if a timeout or signal occurs after we start.
+	 */
+	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
+	    so->so_rcv.sb_cc < resid) &&
+	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
+	    ((flags & MSG_WAITALL) && resid <= so->so_rcv.sb_hiwat)) &&
+	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
+		KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
+		if (so->so_error) {
+			if (m)
+				goto dontblock;
+			error = so->so_error;
+			if ((flags & MSG_PEEK) == 0)
+				so->so_error = 0;
+			goto release;
+		}
+		if (so->so_state & SS_CANTRCVMORE) {
+			if (m)
+				goto dontblock;
+			else
+				goto release;
+		}
+		for (; m; m = m->m_next)
+			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
+				m = so->so_rcv.sb_mb;
+				goto dontblock;
+			}
+		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+			error = ENOTCONN;
+			goto release;
+		}
+		if (resid == 0)
+			goto release;
+		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
+			error = EWOULDBLOCK;
+			goto release;
+		}
+		SBLASTRECORDCHK(&so->so_rcv);
+		SBLASTMBUFCHK(&so->so_rcv);
+		sbunlock(&so->so_rcv);
+		error = sbwait(&so->so_rcv);
+		splx(s);
+		if (error)
+			return (error);
+		goto restart;
+	}
+dontblock:
+	/*
+	 * On entry here, m points to the first record of the socket buffer.
+	 * While we process the initial mbufs containing address and control
+	 * info, we save a copy of m->m_nextpkt into nextrecord.
+	 */
+	if (td)
+		td->td_proc->p_stats->p_ru.ru_msgrcv++;
+	KASSERT(m == so->so_rcv.sb_mb, ("receive 1b"));
+	SBLASTRECORDCHK(&so->so_rcv);
+	SBLASTMBUFCHK(&so->so_rcv);
+	nextrecord = m->m_nextpkt;
+	if (pr->pr_flags & PR_ADDR) {
+		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
+		orig_resid = 0;
+		if (flags & MSG_PEEK) {
+			m = m->m_next;
+		} else {
+			sbfree(&so->so_rcv, m);
+			so->so_rcv.sb_mb = m_free(m);
+			m = so->so_rcv.sb_mb;
+		}
+	}
+	while (m && m->m_type == MT_CONTROL && error == 0) {
+		if (flags & MSG_PEEK) {
+			m = m->m_next;
+		} else {
+			sbfree(&so->so_rcv, m);
+			so->so_rcv.sb_mb = m_free(m);
+			m = so->so_rcv.sb_mb;
+		}
+	}
+
+	/*
+	 * If m is non-NULL, we have some data to read.  From now on,
+	 * make sure to keep sb_lastrecord consistent when working on
+	 * the last packet on the chain (nextrecord == NULL) and we
+	 * change m->m_nextpkt.
+	 */
+	if (m) {
+		if ((flags & MSG_PEEK) == 0) {
+			m->m_nextpkt = nextrecord;
+			/*
+			 * If nextrecord == NULL (this is a single chain),
+			 * then sb_lastrecord may not be valid here if m
+			 * was changed earlier.
+			 */
+			if (nextrecord == NULL) {
+				KASSERT(so->so_rcv.sb_mb == m, ("receive 1c"));
+				so->so_rcv.sb_lastrecord = m;
+			}
+		}
+		type = m->m_type;
+		if (type == MT_OOBDATA)
+			flags |= MSG_OOB;
+	} else {
+		if ((flags & MSG_PEEK) == 0) {
+			KASSERT(so->so_rcv.sb_mb == m, ("receive 1d"));
+			so->so_rcv.sb_mb = nextrecord;
+			SB_EMPTY_FIXUP(&so->so_rcv);
+		}
+	}
+	SBLASTRECORDCHK(&so->so_rcv);
+	SBLASTMBUFCHK(&so->so_rcv);
+
+	moff = 0;
+	offset = 0;
+	while (m && resid > 0 && error == 0) {
+		if (m->m_type == MT_OOBDATA) {
+			if (type != MT_OOBDATA)
+				break;
+		} else if (type == MT_OOBDATA)
+			break;
+		else
+		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
+			("receive 3"));
+		so->so_state &= ~SS_RCVATMARK;
+		len = resid;
+		if (so->so_oobmark && len > so->so_oobmark - offset)
+			len = so->so_oobmark - offset;
+		if (len > m->m_len - moff)
+			len = m->m_len - moff;
+		/*
+		 * If mp is set, just pass back the mbufs.
+		 * Otherwise copy them out via the uio, then free.
+		 * Sockbuf must be consistent here (points to current mbuf,
+		 * it points to next record) when we drop priority;
+		 * we must note any additions to the sockbuf when we
+		 * block interrupts again.
+		 */
+		resid -= len;
+		if (len == m->m_len - moff) {
+			if (m->m_flags & M_EOR)
+				flags |= MSG_EOR;
+			if (flags & MSG_PEEK) {
+				m = m->m_next;
+				moff = 0;
+			} else {
+				nextrecord = m->m_nextpkt;
+				sbfree(&so->so_rcv, m);
+				if (mp) {
+					*mp = m;
+					mp = &m->m_next;
+					so->so_rcv.sb_mb = m = m->m_next;
+					*mp = (struct mbuf *)0;
+				} else {
+					so->so_rcv.sb_mb = m = m_free(m);
+				}
+				/*
+				 * If m != NULL, we also know that
+				 * so->so_rcv.sb_mb != NULL.
+				 */
+				KASSERT(so->so_rcv.sb_mb == m, ("receive 3a"));
+				if (m) {
+					m->m_nextpkt = nextrecord;
+					if (nextrecord == NULL)
+						so->so_rcv.sb_lastrecord = m;
+				} else {
+					so->so_rcv.sb_mb = nextrecord;
+					SB_EMPTY_FIXUP(&so->so_rcv);
+				}
+				SBLASTRECORDCHK(&so->so_rcv);
+				SBLASTMBUFCHK(&so->so_rcv);
+			}
+		} else {
+			if (flags & MSG_PEEK)
+				moff += len;
+			else {
+				if (mp)
+					*mp = m_copym(m, 0, len, M_WAIT);
+				m->m_data += len;
+				m->m_len -= len;
+				so->so_rcv.sb_cc -= len;
+			}
+		}
+		if (so->so_oobmark) {
+			if ((flags & MSG_PEEK) == 0) {
+				so->so_oobmark -= len;
+				if (so->so_oobmark == 0) {
+					so->so_state |= SS_RCVATMARK;
+					break;
+				}
+			} else {
+				offset += len;
+				if (offset == so->so_oobmark)
+					break;
+			}
+		}
+		if (flags & MSG_EOR)
+			break;
+		/*
+		 * If the MSG_WAITALL flag is set (for non-atomic socket),
+		 * we must not quit until "uio->uio_resid == 0" or an error
+		 * termination.  If a signal/timeout occurs, return
+		 * with a short count but without error.
+		 * Keep sockbuf locked against other readers.
+		 */
+		while (flags & MSG_WAITALL && m == 0 && resid > 0 &&
+		    !sosendallatonce(so) && !nextrecord) {
+			if (so->so_error || so->so_state & SS_CANTRCVMORE)
+				break;
+			/*
+			 * The window might have closed to zero, make
+			 * sure we send an ack now that we've drained
+			 * the buffer or we might end up blocking until
+			 * the idle takes over (5 seconds).
+			 */
+			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
+				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
+			SBLASTRECORDCHK(&so->so_rcv);
+			SBLASTMBUFCHK(&so->so_rcv);
+			error = sbwait(&so->so_rcv);
+			if (error) {
+				sbunlock(&so->so_rcv);
+				splx(s);
+				return (0);
+			}
+			m = so->so_rcv.sb_mb;
+			if (m)
+				nextrecord = m->m_nextpkt;
+		}
+	}
+
+	if (m && pr->pr_flags & PR_ATOMIC) {
+		flags |= MSG_TRUNC;
+		if ((flags & MSG_PEEK) == 0)
+			(void) sbdroprecord(&so->so_rcv);
+	}
+	if ((flags & MSG_PEEK) == 0) {
+		if (m == 0) {
+			/*
+			 * First part is an inline SB_EMPTY_FIXUP().  Second
+			 * part makes sure sb_lastrecord is up-to-date if
+			 * there is still data in the socket buffer.
+			 */
+			so->so_rcv.sb_mb = nextrecord;
+			if (so->so_rcv.sb_mb == NULL) {
+				so->so_rcv.sb_mbtail = NULL;
+				so->so_rcv.sb_lastrecord = NULL;
+			} else if (nextrecord->m_nextpkt == NULL)
+				so->so_rcv.sb_lastrecord = nextrecord;
+		}
+		SBLASTRECORDCHK(&so->so_rcv);
+		SBLASTMBUFCHK(&so->so_rcv);
+		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
+			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
+	}
+	if (orig_resid == resid && orig_resid &&
+	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
+		sbunlock(&so->so_rcv);
+		splx(s);
+		goto restart;
+	}
+
+	if (flagsp)
+		*flagsp |= flags;
+release:
+	sbunlock(&so->so_rcv);
+	splx(s);
+	*done = slen - resid;
+#if 0
+	printf("soreceive: error %d slen %llu resid %lld\n", error, slen, resid);
+#endif
+	return (error);
+}
+
+static dev_t kttcp_dev;
+
+/*
+ * Initialization code, both for static and dynamic loading.
+ */
+static int
+kttcpdev_modevent(module_t mod, int type, void *unused)
+{
+	switch (type) {
+	case MOD_LOAD:
+		kttcp_dev = make_dev(&kttcp_cdevsw, 0,
+				      UID_ROOT, GID_WHEEL, 0666,
+				      "kttcp");
+		return 0;
+	case MOD_UNLOAD:
+		/*XXX disallow if active sessions */
+		destroy_dev(kttcp_dev);
+		return 0;
+	}
+	return EINVAL;
+}
+
+static moduledata_t kttcpdev_mod = {
+	"kttcpdev",
+	kttcpdev_modevent,
+	0
+};
+MODULE_VERSION(kttcpdev, 1);
+DECLARE_MODULE(kttcpdev, kttcpdev_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
--- a/tools/tools/kttcp/sys/kttcpio.h
+++ b/tools/tools/kttcp/sys/kttcpio.h
@ -0,0 +1,59 @@
+/*	$FreeBSD$	*/
+/*	$NetBSD$	*/
+
+/*
+ * Copyright (c) 2002 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Frank van der Linden and Jason R. Thorpe for
+ * Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed for the NetBSD Project by
+ *	Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _DEV_KTTCPIO_H_
+#define	_DEV_KTTCPIO_H_
+
+#include <sys/ioccom.h>
+#include <sys/time.h>
+
+struct kttcp_io_args {
+	unsigned long long kio_totalsize;/* i/o total size (IN) */
+	unsigned long long kio_bytesdone;/* i/o actually completed (OUT) */
+	struct timeval kio_elapsed;	 /* elapsed time (OUT) */
+	int	kio_socket;		 /* socket to use for i/o (IN) */
+	int	kio_protovers;		 /* KTTCP protocol version */
+};
+
+#define	KTTCP_IO_SEND		_IOWR('K', 0, struct kttcp_io_args)
+#define	KTTCP_IO_RECV		_IOWR('K', 1, struct kttcp_io_args)
+
+#define	KTTCP_MAX_XMIT		0x7fffffffLL /* XXX can't handle > 31 bits */
+
+#endif /* _DEV_KTTCPIO_H_ */