spa: make read/write queues configurable

We are finding that as customers get larger and faster machines
(hundreds of cores, large NVMe-backed pools) they keep hitting
relatively low performance ceilings. Our profiling work almost always
finds that they're running into bottlenecks on the SPA IO taskqs.
Unfortunately there's often little we can advise at that point, because
there's very few ways to change behaviour without patching.

This commit adds two load-time parameters `zio_taskq_read` and
`zio_taskq_write` that can configure the READ and WRITE IO taskqs
directly.

This achieves two goals: it gives operators (and those that support
them) a way to tune things without requiring a custom build of OpenZFS,
which is often not possible, and it lets us easily try different config
variations in a variety of environments to inform the development of
better defaults for these kind of systems.

Because tuning the IO taskqs really requires a fairly deep understanding
of how IO in ZFS works, and generally isn't needed without a pretty
serious workload and an ability to identify bottlenecks, only minimal
documentation is provided. Its expected that anyone using this is going
to have the source code there as well.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #15675
This commit is contained in:
Rob N 2023-12-21 09:17:14 +11:00 committed by GitHub
parent 957dc1037a
commit 6930ecbb75
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 312 additions and 1 deletions

View file

@ -94,6 +94,12 @@
#define param_set_max_auto_ashift_args(var) \
CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU"
#define spa_taskq_read_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A"
#define spa_taskq_write_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A"
#define fletcher_4_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A"

View file

@ -2297,6 +2297,16 @@ as the number of actual CPUs in the system divided by the
.Sy spa_num_allocators
value.
.
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
Set the queue and thread configuration for the IO read queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zio_taskq_write Ns = Ns Sy sync fixed,1,5 scale fixed,1,5 Pq charp
Set the queue and thread configuration for the IO write queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
This may slightly improve startup time on

View file

@ -173,7 +173,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@ -1211,6 +1211,292 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_taskq = NULL;
}
#ifdef _KERNEL
/*
* The READ and WRITE rows of zio_taskqs are configurable at module load time
* by setting zio_taskq_read or zio_taskq_write.
*
* Example (the defaults for READ and WRITE)
* zio_taskq_read='fixed,1,8 null scale null'
* zio_taskq_write='sync fixed,1,5 scale fixed,1,5'
*
* Each sets the entire row at a time.
*
* 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
* of threads per taskq.
*
* 'null' can only be set on the high-priority queues (queue selection for
* high-priority queues will fall back to the regular queue if the high-pri
* is NULL.
*/
static const char *const modes[ZTI_NMODES] = {
"fixed", "scale", "sync", "null"
};
/* Parse the incoming config string. Modifies cfg */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
int err = 0;
zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
char *next = cfg, *tok, *c;
/*
* Parse out each element from the string and fill `row`. The entire
* row has to be set at once, so any errors are flagged by just
* breaking out of this loop early.
*/
uint_t q;
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
/* `next` is the start of the config */
if (next == NULL)
break;
/* Eat up leading space */
while (isspace(*next))
next++;
if (*next == '\0')
break;
/* Mode ends at space or end of string */
tok = next;
next = strchr(tok, ' ');
if (next != NULL) *next++ = '\0';
/* Parameters start after a comma */
c = strchr(tok, ',');
if (c != NULL) *c++ = '\0';
/* Match mode string */
uint_t mode;
for (mode = 0; mode < ZTI_NMODES; mode++)
if (strcmp(tok, modes[mode]) == 0)
break;
if (mode == ZTI_NMODES)
break;
/* Invalid canary */
row[q].zti_mode = ZTI_NMODES;
/* Per-mode setup */
switch (mode) {
/*
* FIXED is parameterised: number of queues, and number of
* threads per queue.
*/
case ZTI_MODE_FIXED: {
/* No parameters? */
if (c == NULL || *c == '\0')
break;
/* Find next parameter */
tok = c;
c = strchr(tok, ',');
if (c == NULL)
break;
/* Take digits and convert */
unsigned long long nq;
if (!(isdigit(*tok)))
break;
err = ddi_strtoull(tok, &tok, 10, &nq);
/* Must succeed and also end at the next param sep */
if (err != 0 || tok != c)
break;
/* Move past the comma */
tok++;
/* Need another number */
if (!(isdigit(*tok)))
break;
/* Remember start to make sure we moved */
c = tok;
/* Take digits */
unsigned long long ntpq;
err = ddi_strtoull(tok, &tok, 10, &ntpq);
/* Must succeed, and moved forward */
if (err != 0 || tok == c || *tok != '\0')
break;
/*
* sanity; zero queues/threads make no sense, and
* 16K is almost certainly more than anyone will ever
* need and avoids silly numbers like UINT32_MAX
*/
if (nq == 0 || nq >= 16384 ||
ntpq == 0 || ntpq >= 16384)
break;
const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
row[q] = zti;
break;
}
case ZTI_MODE_SCALE: {
const zio_taskq_info_t zti = ZTI_SCALE;
row[q] = zti;
break;
}
case ZTI_MODE_SYNC: {
const zio_taskq_info_t zti = ZTI_SYNC;
row[q] = zti;
break;
}
case ZTI_MODE_NULL: {
/*
* Can only null the high-priority queues; the general-
* purpose ones have to exist.
*/
if (q != ZIO_TASKQ_ISSUE_HIGH &&
q != ZIO_TASKQ_INTERRUPT_HIGH)
break;
const zio_taskq_info_t zti = ZTI_NULL;
row[q] = zti;
break;
}
default:
break;
}
/* Ensure we set a mode */
if (row[q].zti_mode == ZTI_NMODES)
break;
}
/* Didn't get a full row, fail */
if (q < ZIO_TASKQ_TYPES)
return (SET_ERROR(EINVAL));
/* Eat trailing space */
if (next != NULL)
while (isspace(*next))
next++;
/* If there's anything left over then fail */
if (next != NULL && *next != '\0')
return (SET_ERROR(EINVAL));
/* Success! Copy it into the real config */
for (q = 0; q < ZIO_TASKQ_TYPES; q++)
zio_taskqs[t][q] = row[q];
return (0);
}
static int
spa_taskq_param_get(zio_type_t t, char *buf)
{
int pos = 0;
/* Build paramater string from live config */
const char *sep = "";
for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *zti = &zio_taskqs[t][q];
if (zti->zti_mode == ZTI_MODE_FIXED)
pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
modes[zti->zti_mode], zti->zti_count,
zti->zti_value);
else
pos += sprintf(&buf[pos], "%s%s", sep,
modes[zti->zti_mode]);
sep = " ";
}
buf[pos++] = '\n';
buf[pos] = '\0';
return (pos);
}
#ifdef __linux__
static int
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_READ, buf));
}
static int
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf));
}
#else
#include <sys/sbuf.h>
/*
* On FreeBSD load-time parameters can be set up before malloc() is available,
* so we have to do all the parsing work on the stack.
*/
#define SPA_TASKQ_PARAM_MAX (128)
static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err = 0;
if (req->newptr == NULL) {
int len = spa_taskq_param_get(ZIO_TYPE_READ, buf);
struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req);
sbuf_cpy(s, buf);
err = sbuf_finish(s);
sbuf_delete(s);
return (err);
}
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
}
static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err = 0;
if (req->newptr == NULL) {
int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf);
struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req);
sbuf_cpy(s, buf);
err = sbuf_finish(s);
sbuf_delete(s);
return (err);
}
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
}
#endif
#endif /* _KERNEL */
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
@ -10540,6 +10826,15 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
ZMOD_RW,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,