Compute two new metrics. The first is disk load: the average number of
transactions we have queued up, normalized to the queue size. The second
is a set of latency buckets that lets userland compute estimates of the
median, P90, P95 and P99 latencies.

Sponsored by: Netflix, Inc
Warner Losh 2016-09-30 17:49:04 +00:00
parent 848a049b71
commit cf3ec15167
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=306514
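
As a rough illustration (not part of this commit) of the userland side, the
sketch below estimates a percentile from the exported bucket counters. The
sysctl OID name is an assumption (it depends on the periph and unit); the
handler added below emits LAT_BUCKETS (12) comma-separated counts for
power-of-2 buckets from <1ms up to >1024ms.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	NBUCKETS	12	/* mirrors LAT_BUCKETS below */

int
main(void)
{
	/* Assumed OID; the real name depends on the device, e.g. ada0. */
	const char *oid = "kern.cam.ada.0.iosched.read.latencies";
	char buf[512], *p, *tok;
	uint64_t counts[NBUCKETS] = { 0 }, total = 0, seen = 0;
	size_t len = sizeof(buf);
	double pct = 0.90;	/* P90; use 0.50, 0.95, 0.99 for the others */
	int i;

	if (sysctlbyname(oid, buf, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	p = buf;
	for (i = 0; i < NBUCKETS && (tok = strsep(&p, ",")) != NULL; i++) {
		counts[i] = strtoull(tok, NULL, 10);
		total += counts[i];
	}

	/* Walk the cumulative distribution; report the bucket's bound. */
	for (i = 0; i < NBUCKETS && total > 0; i++) {
		seen += counts[i];
		if (seen >= pct * total) {
			if (i < NBUCKETS - 1)
				printf("P%.0f < %d ms\n", pct * 100, 1 << i);
			else
				printf("P%.0f > 1024 ms\n", pct * 100);
			break;
		}
	}
	return (0);
}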

@@ -42,12 +42,14 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_xpt_internal.h>
#include <cam/cam_iosched.h>
#include <ddb/ddb.h>
@@ -73,14 +75,31 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_dynamic_iosched, CTLFLAG_RD,
&do_dynamic_iosched, 1,
"Enable Dynamic I/O scheduler optimizations.");
/*
* For an EMA with a smoothing factor alpha, we know
* alpha = 2 / (N + 1)
* or
* N = 1 + (2 / alpha)
* where N is the number of samples that 86% of the current
* EMA is derived from.
*
* So we invent[*] alpha_bits:
* alpha_bits = -log_2(alpha)
* alpha = 2^-alpha_bits
* So
* N = 1 + 2^(alpha_bits + 1)
*
* The default of 9 gives a lookback of 1025 samples for 86% of the data.
* For a brief intro: https://en.wikipedia.org/wiki/Moving_average
*
* [*] Stolen from the load average code and many other places.
*/
static int alpha_bits = 9;
TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
&alpha_bits, 1,
"Bits in EMA's alpha.");
struct iop_stats;
struct cam_iosched_softc;
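/*
 * Illustration only (not part of this commit): the effective lookback N for
 * a given alpha_bits, per the relation above.  With the default of 9 this
 * gives N = 1 + 2^10 = 1025 samples.  The helper name is hypothetical.
 */
static inline int
cam_iosched_ema_lookback(int bits)
{

	return (1 + (1 << (bits + 1)));
}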
@@ -208,11 +227,17 @@ struct iop_stats
/*
* Statistics on different bits of the process.
*/
/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
/* Exp Moving Average, see alpha_bits for more details */
sbintime_t ema;
sbintime_t emss; /* Exp Moving sum of the squares */
sbintime_t sd; /* Last computed sd */
uint32_t state_flags;
#define IOP_RATE_LIMITED 1u
#define LAT_BUCKETS 12 /* < 1ms < 2ms ... 512ms < 1024ms > 1024ms */
uint64_t latencies[LAT_BUCKETS];
struct cam_iosched_softc *softc;
};
@@ -258,6 +283,7 @@ struct cam_iosched_softc
int read_bias; /* Read bias setting */
int current_read_bias; /* Current read bias state */
int total_ticks;
int load; /* EMA of 'load average' of disk / 2^16 */
struct bio_queue_head write_queue;
struct iop_stats read_stats, write_stats, trim_stats;
@@ -509,6 +535,7 @@ cam_iosched_ticker(void *arg)
{
struct cam_iosched_softc *isc = arg;
sbintime_t now, delta;
int pending;
callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
@@ -525,6 +552,36 @@ cam_iosched_ticker(void *arg)
cam_iosched_schedule(isc, isc->periph);
/*
* isc->load is an EMA of the pending I/Os at each tick. The number of
* pending I/Os is the sum of the I/Os queued to the hardware, and those
* in the software queue that could be queued to the hardware if there
* were slots.
*
* ios_stats.pending is a count of requests in the SIM right now for
* each of these types of I/O. So the total pending count is the sum of
* these I/Os and the sum of the queued I/Os still in the software queue
* for those operations that aren't being rate limited at the moment.
*
* The reason for the rate limiting bit is because those I/Os
* aren't part of the software queued load (since we could
* give them to hardware, but choose not to).
*
* Note: due to a bug in counting pending TRIM in the device, we
* don't include them in this count. We count each BIO_DELETE in
* the pending count, but the periph drivers collapse them down
* into one TRIM command. That one trim command gets the completion
* so the counts get off.
*/
pending = isc->read_stats.pending + isc->write_stats.pending /* + isc->trim_stats.pending */;
pending += !!(isc->read_stats.state_flags & IOP_RATE_LIMITED) * isc->read_stats.queued +
!!(isc->write_stats.state_flags & IOP_RATE_LIMITED) * isc->write_stats.queued /* +
!!(isc->trim_stats.state_flags & IOP_RATE_LIMITED) * isc->trim_stats.queued */ ;
pending <<= 16;
pending /= isc->periph->path->device->ccbq.total_openings;
isc->load = (pending + (isc->load << 13) - isc->load) >> 13; /* see above: 13 -> 16385 samples / 200 per second = ~82s of history */
isc->total_ticks++;
}
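/*
 * Illustration only (not part of this commit): the load update above is the
 * fixed-point form of ema = alpha * sample + (1 - alpha) * ema with
 * alpha = 2^-bits (bits = 13 above), avoiding floating point in the kernel.
 * The helper name is hypothetical.
 */
static inline int
cam_iosched_ema_update(int ema, int sample, int bits)
{

	return ((sample + (ema << bits) - ema) >> bits);
}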
@@ -610,7 +667,7 @@ cam_iosched_cl_maybe_steer(struct control_loop *clp)
if (isc->write_stats.current > isc->write_stats.max)
isc->write_stats.current = isc->write_stats.max;
if (old != isc->write_stats.current && iosched_debug)
printf("Steering write from %d kBps to %d kBps due to latency of %jdms\n",
printf("Steering write from %d kBps to %d kBps due to latency of %jdus\n",
old, isc->write_stats.current,
(uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
break;
@@ -714,7 +771,7 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
char buf[16];
struct iop_stats *ios;
struct cam_iosched_softc *isc;
int value, i, error, cantick;
int value, i, error;
const char *p;
ios = arg1;
@@ -742,21 +799,9 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
cam_periph_unlock(isc->periph);
return error;
}
cantick = !!limsw[isc->read_stats.limiter].l_tick +
!!limsw[isc->write_stats.limiter].l_tick +
!!limsw[isc->trim_stats.limiter].l_tick +
1; /* Control loop requires it */
if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
if (cantick == 0) {
callout_stop(&isc->ticker);
isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
}
} else {
if (cantick != 0) {
callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
}
}
/* Note: disk load average requires ticker to be always running */
callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
cam_periph_unlock(isc->periph);
return 0;
@@ -821,6 +866,25 @@ cam_iosched_sbintime_sysctl(SYSCTL_HANDLER_ARGS)
return 0;
}
static int
cam_iosched_sysctl_latencies(SYSCTL_HANDLER_ARGS)
{
int i, error;
struct sbuf sb;
uint64_t *latencies;
latencies = arg1;
sbuf_new_for_sysctl(&sb, NULL, LAT_BUCKETS * 16, req);
for (i = 0; i < LAT_BUCKETS - 1; i++)
sbuf_printf(&sb, "%jd,", (intmax_t)latencies[i]);
sbuf_printf(&sb, "%jd", (intmax_t)latencies[LAT_BUCKETS - 1]);
error = sbuf_finish(&sb);
sbuf_delete(&sb);
return (error);
}
static void
cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
{
@@ -884,6 +948,11 @@ cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stat
&ios->current, 0,
"current resource");
SYSCTL_ADD_PROC(ctx, n,
OID_AUTO, "latencies", CTLTYPE_STRING | CTLFLAG_RD,
&ios->latencies, 0,
cam_iosched_sysctl_latencies, "A",
"Array of power of 2 latency from 1ms to 1.024s");
}
static void
@@ -1051,6 +1120,11 @@ void cam_iosched_sysctl_init(struct cam_iosched_softc *isc,
OID_AUTO, "total_ticks", CTLFLAG_RD,
&isc->total_ticks, 0,
"Total number of ticks we've done");
SYSCTL_ADD_INT(ctx, n,
OID_AUTO, "load", CTLFLAG_RD,
&isc->load, 0,
"scaled load average / 100");
#endif
}
@@ -1100,6 +1174,7 @@ cam_iosched_get_write(struct cam_iosched_softc *isc)
if (iosched_debug)
printf("Reads present and current_read_bias is %d queued writes %d queued reads %d\n", isc->current_read_bias, isc->write_stats.queued, isc->read_stats.queued);
isc->current_read_bias--;
/* We're not limiting writes, per se, just doing reads first */
return NULL;
}
@@ -1109,6 +1184,7 @@ cam_iosched_get_write(struct cam_iosched_softc *isc)
if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) {
if (iosched_debug)
printf("Can't write because limiter says no.\n");
isc->write_stats.state_flags |= IOP_RATE_LIMITED;
return NULL;
}
@@ -1125,6 +1201,7 @@ cam_iosched_get_write(struct cam_iosched_softc *isc)
}
if (iosched_debug > 9)
printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
isc->write_stats.state_flags &= ~IOP_RATE_LIMITED;
return bp;
}
#endif
@@ -1224,14 +1301,17 @@ cam_iosched_next_bio(struct cam_iosched_softc *isc)
#ifdef CAM_IOSCHED_DYNAMIC
/*
* For the netflix scheduler, bio_queue is only for reads, so enforce
* For the dynamic scheduler, bio_queue is only for reads, so enforce
* the limits here. Enforce only for reads.
*/
if (do_dynamic_iosched) {
if (bp->bio_cmd == BIO_READ &&
cam_iosched_limiter_iop(&isc->read_stats, bp) != 0)
cam_iosched_limiter_iop(&isc->read_stats, bp) != 0) {
isc->read_stats.state_flags |= IOP_RATE_LIMITED;
return NULL;
}
}
isc->read_stats.state_flags &= ~IOP_RATE_LIMITED;
#endif
bioq_remove(&isc->bio_queue, bp);
#ifdef CAM_IOSCHED_DYNAMIC
@@ -1478,13 +1558,41 @@ mul(uint64_t a, uint64_t b)
return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
}
static sbintime_t latencies[] = {
SBT_1MS << 0,
SBT_1MS << 1,
SBT_1MS << 2,
SBT_1MS << 3,
SBT_1MS << 4,
SBT_1MS << 5,
SBT_1MS << 6,
SBT_1MS << 7,
SBT_1MS << 8,
SBT_1MS << 9,
SBT_1MS << 10
};
static void
cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
{
sbintime_t y, yy;
uint64_t var;
int i;
/*
* Keep counts for latency. We do it by power of two buckets.
* This helps us spot outlier behavior obscured by averages.
*/
for (i = 0; i < LAT_BUCKETS - 1; i++) {
if (sim_latency < latencies[i]) {
iop->latencies[i]++;
break;
}
}
if (i == LAT_BUCKETS - 1)
iop->latencies[i]++; /* Put all > 1024ms values into the last bucket. */
/*
* Classic exponentially decaying average with a tiny alpha
* (2 ^ -alpha_bits). For more info see the NIST statistical
* handbook.