Reintroduce priority for the TRIM ZIOs instead of using the "NOW" priority

This changes how TRIM requests are generated: they now use ZIO_TYPE_FREE plus
a priority instead of ZIO_TYPE_IOCTL, and are only translated to the required
GEOM values once they are processed by vdev_geom. This reduces the amount of
change required for FREE requests to be supported by the new IO scheduler.
It also eliminates the need for a specific DKIOCTRIM ioctl.
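
In outline, zio_trim() now builds a queueable FREE zio with the TRIM priority
instead of issuing a DKIOCTRIM ioctl; a minimal sketch mirroring the zio.c
hunk below:

zio_t *
zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
{
	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/* FREE type + TRIM priority; vdev_geom later maps this to BIO_DELETE. */
	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}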

Also stop FREE vdev child I/Os from running ZIO_STAGE_VDEV_IO_DONE as part
of their pipeline.
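
Concretely, this amounts to masking the stage out of the child pipeline when
the parent's pipeline does not include it, as in the zio_vdev_child_io() hunk
below:

	/* Not all IO types require vdev io done stage e.g. free */
	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;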

Because the new IO scheduler, when asked to execute one type of IO, can end
up running an IO of a different type, zio_trim requests must be processed
without holding the trim map lock (tm->tm_lock): issuing the free request may
cause a write request to run, which triggers a trim_map_write_start call;
that call takes the trim map lock and would therefore recurse on a
non-recursive sx lock.
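
In trim_map_vdev_commit() this takes roughly the following shape (see the
trim_map.c hunk below): the segment is accounted for, the lock is dropped
around zio_nowait(), and the lock is re-taken before fetching the next
segment:

	size = ts->ts_end - ts->ts_start;
	offset = ts->ts_start;
	TRIM_MAP_SDEC(tm, size);
	TRIM_MAP_QDEC(tm);
	/*
	 * Drop tm_lock across the issue: the IO scheduler may run a
	 * write here, and trim_map_write_start() would otherwise
	 * recurse on this non-recursive lock.
	 */
	mutex_exit(&tm->tm_lock);
	zio_nowait(zio_trim(zio, spa, vd, offset, size));
	mutex_enter(&tm->tm_lock);
	ts = trim_map_first(tm, txgtarget, txgsafe, timelimit);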

This is based on avg's original work, so credit to him.

MFC after:	1 month
Steven Hartland 2014-04-30 17:46:29 +00:00
parent b28e753c93
commit 82ce008538
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=265152
7 changed files with 124 additions and 73 deletions

View file

@@ -75,8 +75,6 @@ extern "C" {
*/
#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
#define DKIOCTRIM (DKIOC|35) /* TRIM a block */
struct dk_callback {
void (*dkc_callback)(void *dkc_cookie, int error);
void *dkc_cookie;

View file

@@ -135,9 +135,10 @@ typedef enum zio_priority {
ZIO_PRIORITY_ASYNC_READ, /* prefetch */
ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */
} zio_priority_t;
#define ZIO_PIPELINE_CONTINUE 0x100
@@ -508,7 +509,7 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
enum zio_flag flags);
zio_priority_t priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,

View file

@@ -215,6 +215,10 @@ enum zio_stage {
ZIO_STAGE_FREE_BP_INIT | \
ZIO_STAGE_DVA_FREE)
#define ZIO_FREE_PHYS_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_VDEV_IO_STAGES)
#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \

View file

@@ -449,7 +449,7 @@ trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
trim_map_t *tm = vd->vdev_trimmap;
trim_seg_t *ts;
uint64_t size, txgtarget, txgsafe;
uint64_t size, offset, txgtarget, txgsafe;
hrtime_t timelimit;
ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -477,9 +477,20 @@ trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
avl_remove(&tm->tm_queued_frees, ts);
avl_add(&tm->tm_inflight_frees, ts);
size = ts->ts_end - ts->ts_start;
zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
offset = ts->ts_start;
TRIM_MAP_SDEC(tm, size);
TRIM_MAP_QDEC(tm);
/*
* We drop the lock while we call zio_nowait as the IO
* scheduler can result in a different IO being run e.g.
* a write which would result in a recursive lock.
*/
mutex_exit(&tm->tm_lock);
zio_nowait(zio_trim(zio, spa, vd, offset, size));
mutex_enter(&tm->tm_lock);
ts = trim_map_first(tm, txgtarget, txgsafe, timelimit);
}
mutex_exit(&tm->tm_lock);
}

View file

@@ -800,10 +800,11 @@ vdev_geom_io_start(zio_t *zio)
vd = zio->io_vd;
if (zio->io_type == ZIO_TYPE_IOCTL) {
switch (zio->io_type) {
case ZIO_TYPE_IOCTL:
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
zio->io_error = SET_ERROR(ENXIO);
return (ZIO_PIPELINE_CONTINUE);
}
@@ -812,28 +813,28 @@ vdev_geom_io_start(zio_t *zio)
if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
break;
if (vd->vdev_nowritecache) {
zio->io_error = ENOTSUP;
break;
}
goto sendreq;
case DKIOCTRIM:
if (vdev_geom_bio_delete_disable)
break;
if (vd->vdev_notrim) {
zio->io_error = ENOTSUP;
zio->io_error = SET_ERROR(ENOTSUP);
break;
}
goto sendreq;
default:
zio->io_error = ENOTSUP;
zio->io_error = SET_ERROR(ENOTSUP);
}
return (ZIO_PIPELINE_CONTINUE);
case ZIO_TYPE_FREE:
if (vdev_geom_bio_delete_disable)
return (ZIO_PIPELINE_CONTINUE);
if (vd->vdev_notrim) {
zio->io_error = SET_ERROR(ENOTSUP);
return (ZIO_PIPELINE_CONTINUE);
}
}
sendreq:
cp = vd->vdev_tsd;
if (cp == NULL) {
zio->io_error = ENXIO;
zio->io_error = SET_ERROR(ENXIO);
return (ZIO_PIPELINE_CONTINUE);
}
bp = g_alloc_bio();
@@ -846,21 +847,20 @@ vdev_geom_io_start(zio_t *zio)
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
break;
case ZIO_TYPE_FREE:
bp->bio_cmd = BIO_DELETE;
bp->bio_data = NULL;
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
break;
case ZIO_TYPE_IOCTL:
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
bp->bio_cmd = BIO_FLUSH;
bp->bio_flags |= BIO_ORDERED;
bp->bio_data = NULL;
bp->bio_offset = cp->provider->mediasize;
bp->bio_length = 0;
break;
case DKIOCTRIM:
bp->bio_cmd = BIO_DELETE;
bp->bio_data = NULL;
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
break;
}
break;
}

View file

@@ -40,9 +40,9 @@
*
* ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
* I/O scheduler determines when and in what order those operations are
* issued. The I/O scheduler divides operations into five I/O classes
* issued. The I/O scheduler divides operations into six I/O classes
* prioritized in the following order: sync read, sync write, async read,
* async write, and scrub/resilver. Each queue defines the minimum and
* async write, scrub/resilver and trim. Each queue defines the minimum and
* maximum number of concurrent operations that may be issued to the device.
* In addition, the device has an aggregate maximum. Note that the sum of the
* per-queue minimums must not exceed the aggregate maximum, and if the
@@ -61,7 +61,7 @@
* done in the order specified above. No further operations are issued if the
* aggregate maximum number of concurrent operations has been hit or if there
* are no operations queued for an I/O class that has not hit its maximum.
* Every time an i/o is queued or an operation completes, the I/O scheduler
* Every time an I/O is queued or an operation completes, the I/O scheduler
* looks for new operations to issue.
*
* All I/O classes have a fixed maximum number of outstanding operations
@@ -70,7 +70,7 @@
* transaction groups (see txg.c). Transaction groups enter the syncing state
* periodically so the number of queued async writes will quickly burst up and
* then bleed down to zero. Rather than servicing them as quickly as possible,
* the I/O scheduler changes the maximum number of active async write i/os
* the I/O scheduler changes the maximum number of active async write I/Os
* according to the amount of dirty data in the pool (see dsl_pool.c). Since
* both throughput and latency typically increase with the number of
* concurrent operations issued to physical devices, reducing the burstiness
@@ -113,14 +113,14 @@
*/
/*
* The maximum number of i/os active to each device. Ideally, this will be >=
* The maximum number of I/Os active to each device. Ideally, this will be >=
* the sum of each queue's max_active. It must be at least the sum of each
* queue's min_active.
*/
uint32_t zfs_vdev_max_active = 1000;
/*
* Per-queue limits on the number of i/os active to each device. If the
* Per-queue limits on the number of I/Os active to each device. If the
* sum of the queue's max_active is < zfs_vdev_max_active, then the
* min_active comes into play. We will send min_active from each queue,
* and then select from queues in the order defined by zio_priority_t.
@@ -145,6 +145,14 @@ uint32_t zfs_vdev_async_write_min_active = 1;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
uint32_t zfs_vdev_trim_min_active = 1;
/*
* TRIM max active is large in comparison to the other values due to the fact
* that TRIM IOs are coalesced at the device layer. This value is set such
* that a typical SSD can process the queued IOs in a single request.
*/
uint32_t zfs_vdev_trim_max_active = 64;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -171,7 +179,7 @@ SYSCTL_DECL(_vfs_zfs_vdev);
TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RW,
&zfs_vdev_max_active, 0,
"The maximum number of i/os of all types active for each device.");
"The maximum number of I/Os of all types active for each device.");
#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
TUNABLE_INT("vfs.zfs.vdev." #name "_min_active", \
@@ -199,6 +207,8 @@ ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
ZFS_VDEV_QUEUE_KNOB_MIN(trim);
ZFS_VDEV_QUEUE_KNOB_MAX(trim);
#undef ZFS_VDEV_QUEUE_KNOB
@@ -299,6 +309,7 @@ static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
@@ -315,6 +326,7 @@ static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
@@ -403,6 +415,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_async_write_min_active);
case ZIO_PRIORITY_SCRUB:
return (zfs_vdev_scrub_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -454,6 +468,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
spa->spa_dsl_pool->dp_dirty_total));
case ZIO_PRIORITY_SCRUB:
return (zfs_vdev_scrub_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -470,6 +486,8 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
spa_t *spa = vq->vq_vdev->vdev_spa;
zio_priority_t p;
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
@@ -511,10 +529,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
zio_t *first, *last, *aio, *dio, *mandatory, *nio;
uint64_t maxgap = 0;
uint64_t size;
boolean_t stretch = B_FALSE;
vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
avl_tree_t *t = &vqc->vqc_queued_tree;
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
boolean_t stretch;
avl_tree_t *t;
enum zio_flag flags;
ASSERT(MUTEX_HELD(&vq->vq_lock));
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);
@@ -552,6 +571,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-option I/O.
*/
flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
@@ -591,6 +612,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* non-optional I/O is close enough to make aggregation
* worthwhile.
*/
stretch = B_FALSE;
if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
zio_t *nio = last;
while ((dio = AVL_NEXT(t, nio)) != NULL &&
@@ -731,11 +753,13 @@ vdev_queue_io(zio_t *zio)
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB)
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
} else {
ASSERT(zio->io_type == ZIO_TYPE_FREE);
zio->io_priority = ZIO_PRIORITY_TRIM;
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

View file

@@ -788,6 +788,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
stage |= ZIO_STAGE_ISSUE_ASYNC;
flags |= ZIO_FLAG_DONT_QUEUE;
zio = zio_create(pio, spa, txg, bp, NULL, size,
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
@@ -827,14 +829,14 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private,
enum zio_flag flags)
zio_priority_t priority, enum zio_flag flags)
{
zio_t *zio;
int c;
if (vd->vdev_children == 0) {
zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL,
ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
zio->io_cmd = cmd;
@@ -843,7 +845,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
for (c = 0; c < vd->vdev_children; c++)
zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
offset, size, done, private, flags));
offset, size, done, private, priority, flags));
}
return (zio);
@@ -928,6 +930,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/* Not all IO types require vdev io done stage e.g. free */
if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
if (vd->vdev_children == 0)
offset += VDEV_LABEL_START_SIZE;
@@ -973,7 +979,7 @@ void
zio_flush(zio_t *zio, vdev_t *vd)
{
zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
NULL, NULL,
NULL, NULL, ZIO_PRIORITY_NOW,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
@@ -983,9 +989,10 @@ zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
ASSERT(vd->vdev_ops->vdev_op_leaf);
return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
NULL, NULL,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}
void
@@ -2530,7 +2537,8 @@ zio_vdev_io_start(zio_t **ziop)
return (vdev_mirror_ops.vdev_op_io_start(zio));
}
if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
zio->io_priority == ZIO_PRIORITY_NOW) {
trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
return (ZIO_PIPELINE_CONTINUE);
}
@@ -2598,31 +2606,33 @@ zio_vdev_io_start(zio_t **ziop)
return (ZIO_PIPELINE_CONTINUE);
}
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
if (vd->vdev_ops->vdev_op_leaf) {
switch (zio->io_type) {
case ZIO_TYPE_READ:
if (vdev_cache_read(zio))
return (ZIO_PIPELINE_CONTINUE);
/* FALLTHROUGH */
case ZIO_TYPE_WRITE:
case ZIO_TYPE_FREE:
if ((zio = vdev_queue_io(zio)) == NULL)
return (ZIO_PIPELINE_STOP);
*ziop = zio;
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (ZIO_PIPELINE_CONTINUE);
if ((zio = vdev_queue_io(zio)) == NULL)
return (ZIO_PIPELINE_STOP);
*ziop = zio;
if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return (ZIO_PIPELINE_STOP);
if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
return (ZIO_PIPELINE_STOP);
}
break;
}
}
/*
* Note that we ignore repair writes for TRIM because they can conflict
* with normal writes. This isn't an issue because, by definition, we
* only repair blocks that aren't freed.
*/
if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
if (!trim_map_write_start(zio))
/*
* Note that we ignore repair writes for TRIM because they can
* conflict with normal writes. This isn't an issue because, by
* definition, we only repair blocks that aren't freed.
*/
if (zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!trim_map_write_start(zio))
return (ZIO_PIPELINE_STOP);
}
@@ -2644,7 +2654,8 @@ zio_vdev_io_done(zio_t **ziop)
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_FREE)) {
if (zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR))
@@ -2725,7 +2736,8 @@ zio_vdev_io_assess(zio_t **ziop)
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
if (zio->io_type == ZIO_TYPE_FREE &&
zio->io_priority != ZIO_PRIORITY_NOW) {
switch (zio->io_error) {
case 0:
ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
@@ -2738,6 +2750,7 @@ zio_vdev_io_assess(zio_t **ziop)
ZIO_TRIM_STAT_BUMP(failed);
break;
}
}
/*
* If the I/O failed, determine whether we should attempt to retry it.