Compute two new metrics. The first is disk load: the average number of
transactions we have queued up, normalized to the queue size. The second
is a set of latency buckets that lets userland compute estimates of the
median, P90, P95 and P99 latencies.

Sponsored by: Netflix, Inc
Warner Losh 2016-09-30 17:49:04 +00:00
parent 848a049b71
commit cf3ec15167
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=306514
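
As a rough illustration (not part of this commit) of the userland side, the
sketch below estimates a percentile from the exported bucket counters. The
sysctl OID name is an assumption (it depends on the periph and unit); the
handler added below emits LAT_BUCKETS (12) comma-separated counts for
power-of-2 buckets from <1ms up to >1024ms.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	NBUCKETS	12	/* mirrors LAT_BUCKETS below */

int
main(void)
{
	/* Assumed OID; the real name depends on the device, e.g. ada0. */
	const char *oid = "kern.cam.ada.0.iosched.read.latencies";
	char buf[512], *p, *tok;
	uint64_t counts[NBUCKETS] = { 0 }, total = 0, seen = 0;
	size_t len = sizeof(buf);
	double pct = 0.90;	/* P90; use 0.50, 0.95, 0.99 for the others */
	int i;

	if (sysctlbyname(oid, buf, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	p = buf;
	for (i = 0; i < NBUCKETS && (tok = strsep(&p, ",")) != NULL; i++) {
		counts[i] = strtoull(tok, NULL, 10);
		total += counts[i];
	}

	/* Walk the cumulative distribution; report the bucket's bound. */
	for (i = 0; i < NBUCKETS && total > 0; i++) {
		seen += counts[i];
		if (seen >= pct * total) {
			if (i < NBUCKETS - 1)
				printf("P%.0f < %d ms\n", pct * 100, 1 << i);
			else
				printf("P%.0f > 1024 ms\n", pct * 100);
			break;
		}
	}
	return (0);
}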

@@ -42,12 +42,14 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_xpt_internal.h>
#include <cam/cam_iosched.h>
#include <ddb/ddb.h>
@@ -73,14 +75,31 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_dynamic_iosched, CTLFLAG_RD,
&do_dynamic_iosched, 1,
"Enable Dynamic I/O scheduler optimizations.");
/*
* For an EMA with a smoothing factor alpha, we know
* alpha = 2 / (N + 1)
* or
* N = 1 + (2 / alpha)
* where N is the number of samples that 86% of the current
* EMA is derived from.
*
* So we invent[*] alpha_bits:
* alpha_bits = -log_2(alpha)
* alpha = 2^-alpha_bits
* So
* N = 1 + 2^(alpha_bits + 1)
*
* The default of 9 gives a lookback of 1025 samples for 86% of the data.
* For a brief intro: https://en.wikipedia.org/wiki/Moving_average
*
* [*] Stolen from the load average code and many other places.
*/
static int alpha_bits = 9;
TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
&alpha_bits, 1,
"Bits in EMA's alpha.");
struct iop_stats;
struct cam_iosched_softc;
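/*
 * Illustration only (not part of this commit): the effective lookback N for
 * a given alpha_bits, per the relation above.  With the default of 9 this
 * gives N = 1 + 2^10 = 1025 samples.  The helper name is hypothetical.
 */
static inline int
cam_iosched_ema_lookback(int bits)
{

	return (1 + (1 << (bits + 1)));
}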
@@ -208,11 +227,17 @@ struct iop_stats
/*
* Statistics on different bits of the process.
*/
/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
/* Exp Moving Average, see alpha_bits for more details */
sbintime_t ema;
sbintime_t emss; /* Exp Moving sum of the squares */
sbintime_t sd; /* Last computed sd */
uint32_t state_flags;
#define IOP_RATE_LIMITED 1u
#define LAT_BUCKETS 12 /* < 1ms < 2ms ... 512ms < 1024ms > 1024ms */
uint64_t latencies[LAT_BUCKETS];
struct cam_iosched_softc *softc;
};
@@ -258,6 +283,7 @@ struct cam_iosched_softc
int read_bias; /* Read bias setting */
int current_read_bias; /* Current read bias state */
int total_ticks;
int load; /* EMA of 'load average' of disk / 2^16 */
struct bio_queue_head write_queue;
struct iop_stats read_stats, write_stats, trim_stats;
@@ -509,6 +535,7 @@ cam_iosched_ticker(void *arg)
{
struct cam_iosched_softc *isc = arg;
sbintime_t now, delta;
int pending;
callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
@@ -525,6 +552,36 @@ cam_iosched_ticker(void *arg)
cam_iosched_schedule(isc, isc->periph);
/*
* isc->load is an EMA of the pending I/Os at each tick. The number of
* pending I/Os is the sum of the I/Os queued to the hardware, and those
* in the software queue that could be queued to the hardware if there
* were slots.
*
* ios_stats.pending is a count of requests in the SIM right now for
* each of these types of I/O. So the total pending count is the sum of
* these I/Os and the sum of the queued I/Os still in the software queue
* for those operations that aren't being rate limited at the moment.
*
* The reason for the rate limiting bit is because those I/Os
* aren't part of the software queued load (since we could
* give them to hardware, but choose not to).
*
* Note: due to a bug in counting pending TRIM in the device, we
* don't include them in this count. We count each BIO_DELETE in
* the pending count, but the periph drivers collapse them down
* into one TRIM command. That one trim command gets the completion
* so the counts get off.
*/
pending = isc->read_stats.pending + isc->write_stats.pending /* + isc->trim_stats.pending */;
pending += !!(isc->read_stats.state_flags & IOP_RATE_LIMITED) * isc->read_stats.queued +
!!(isc->write_stats.state_flags & IOP_RATE_LIMITED) * isc->write_stats.queued /* +
!!(isc->trim_stats.state_flags & IOP_RATE_LIMITED) * isc->trim_stats.queued */ ;
pending <<= 16;
pending /= isc->periph->path->device->ccbq.total_openings;
isc->load = (pending + (isc->load << 13) - isc->load) >> 13; /* see above: 13 -> 16385 samples / 200 per second = ~82s of history */
isc->total_ticks++;
}
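/*
 * Illustration only (not part of this commit): the load update above is the
 * fixed-point form of ema = alpha * sample + (1 - alpha) * ema with
 * alpha = 2^-bits (bits = 13 above), avoiding floating point in the kernel.
 * The helper name is hypothetical.
 */
static inline int
cam_iosched_ema_update(int ema, int sample, int bits)
{

	return ((sample + (ema << bits) - ema) >> bits);
}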
@@ -610,7 +667,7 @@ cam_iosched_cl_maybe_steer(struct control_loop *clp)
if (isc->write_stats.current > isc->write_stats.max)
isc->write_stats.current = isc->write_stats.max;
if (old != isc->write_stats.current && iosched_debug)
printf("Steering write from %d kBps to %d kBps due to latency of %jdms\n",
printf("Steering write from %d kBps to %d kBps due to latency of %jdus\n",
old, isc->write_stats.current,
(uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32);
break;
@@ -714,7 +771,7 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
char buf[16];
struct iop_stats *ios;
struct cam_iosched_softc *isc;
int value, i, error, cantick;
int value, i, error;
const char *p;
ios = arg1;
@@ -742,21 +799,9 @@ cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS)
cam_periph_unlock(isc->periph);
return error;
}
cantick = !!limsw[isc->read_stats.limiter].l_tick +
!!limsw[isc->write_stats.limiter].l_tick +
!!limsw[isc->trim_stats.limiter].l_tick +
1; /* Control loop requires it */
if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) {
if (cantick == 0) {
callout_stop(&isc->ticker);
isc->flags &= ~CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
}
} else {
if (cantick != 0) {
callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
}
}
/* Note: disk load average requires ticker to be always running */
callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE;
cam_periph_unlock(isc->periph);
return 0;
@@ -821,6 +866,25 @@ cam_iosched_sbintime_sysctl(SYSCTL_HANDLER_ARGS)
return 0;
}
static int
cam_iosched_sysctl_latencies(SYSCTL_HANDLER_ARGS)
{
int i, error;
struct sbuf sb;
uint64_t *latencies;
latencies = arg1;
sbuf_new_for_sysctl(&sb, NULL, LAT_BUCKETS * 16, req);
for (i = 0; i < LAT_BUCKETS - 1; i++)
sbuf_printf(&sb, "%jd,", (intmax_t)latencies[i]);
sbuf_printf(&sb, "%jd", (intmax_t)latencies[LAT_BUCKETS - 1]);
error = sbuf_finish(&sb);
sbuf_delete(&sb);
return (error);
}
static void
cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name)
{
@@ -884,6 +948,11 @@ cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stat
&ios->current, 0,
"current resource");
SYSCTL_ADD_PROC(ctx, n,
OID_AUTO, "latencies", CTLTYPE_STRING | CTLFLAG_RD,
&ios->latencies, 0,
cam_iosched_sysctl_latencies, "A",
"Array of power of 2 latency from 1ms to 1.024s");
}
static void
@@ -1051,6 +1120,11 @@ void cam_iosched_sysctl_init(struct cam_iosched_softc *isc,
OID_AUTO, "total_ticks", CTLFLAG_RD,
&isc->total_ticks, 0,
"Total number of ticks we've done");
SYSCTL_ADD_INT(ctx, n,
OID_AUTO, "load", CTLFLAG_RD,
&isc->load, 0,
"scaled load average / 100");
#endif
}
@@ -1100,6 +1174,7 @@ cam_iosched_get_write(struct cam_iosched_softc *isc)
if (iosched_debug)
printf("Reads present and current_read_bias is %d queued writes %d queued reads %d\n", isc->current_read_bias, isc->write_stats.queued, isc->read_stats.queued);
isc->current_read_bias--;
/* We're not limiting writes, per se, just doing reads first */
return NULL;
}
@@ -1109,6 +1184,7 @@ cam_iosched_get_write(struct cam_iosched_softc *isc)
if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) {
if (iosched_debug)
printf("Can't write because limiter says no.\n");
isc->write_stats.state_flags |= IOP_RATE_LIMITED;
return NULL;
}
@@ -1125,6 +1201,7 @@ cam_iosched_get_write(struct cam_iosched_softc *isc)
}
if (iosched_debug > 9)
printf("HWQ : %p %#x\n", bp, bp->bio_cmd);
isc->write_stats.state_flags &= ~IOP_RATE_LIMITED;
return bp;
}
#endif
@@ -1224,14 +1301,17 @@ cam_iosched_next_bio(struct cam_iosched_softc *isc)
#ifdef CAM_IOSCHED_DYNAMIC
/*
* For the netflix scheduler, bio_queue is only for reads, so enforce
* For the dynamic scheduler, bio_queue is only for reads, so enforce
* the limits here. Enforce only for reads.
*/
if (do_dynamic_iosched) {
if (bp->bio_cmd == BIO_READ &&
cam_iosched_limiter_iop(&isc->read_stats, bp) != 0)
cam_iosched_limiter_iop(&isc->read_stats, bp) != 0) {
isc->read_stats.state_flags |= IOP_RATE_LIMITED;
return NULL;
}
}
isc->read_stats.state_flags &= ~IOP_RATE_LIMITED;
#endif
bioq_remove(&isc->bio_queue, bp);
#ifdef CAM_IOSCHED_DYNAMIC
@@ -1478,13 +1558,41 @@ mul(uint64_t a, uint64_t b)
return ((ah * bh) << 32) + al * bh + ah * bl + ((al * bl) >> 32);
}
static sbintime_t latencies[] = {
SBT_1MS << 0,
SBT_1MS << 1,
SBT_1MS << 2,
SBT_1MS << 3,
SBT_1MS << 4,
SBT_1MS << 5,
SBT_1MS << 6,
SBT_1MS << 7,
SBT_1MS << 8,
SBT_1MS << 9,
SBT_1MS << 10
};
static void
cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
{
sbintime_t y, yy;
uint64_t var;
int i;
/*
* Keep counts for latency. We do it by power of two buckets.
* This helps us spot outlier behavior obscured by averages.
*/
for (i = 0; i < LAT_BUCKETS - 1; i++) {
if (sim_latency < latencies[i]) {
iop->latencies[i]++;
break;
}
}
if (i == LAT_BUCKETS - 1)
iop->latencies[i]++; /* Put all > 1024ms values into the last bucket. */
/*
* Classic exponentially decaying average with a tiny alpha
* (2 ^ -alpha_bits). For more info see the NIST statistical
* handbook.