diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c index d3502309eab5..62c98f407198 100644 --- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ /* @@ -5137,6 +5138,7 @@ ztest_run(ztest_shared_t *zs) */ kernel_init(FREAD | FWRITE); VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); + spa->spa_debug = B_TRUE; zs->zs_spa = spa; spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 17b4b12c4ee4..a0620b889347 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -30,9 +31,34 @@ #include #include +/* + * Allow allocations to switch to gang blocks quickly. We do this to + * avoid having to load lots of space_maps in a given txg. There are, + * however, some cases where we want to avoid "fast" ganging and instead + * we want to do an exhaustive search of all metaslabs on this device. + * Currently we don't allow any gang or dump device related allocations + * to "fast" gang. + */ +#define CAN_FASTGANG(flags) \ + (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ + METASLAB_GANG_AVOID))) + uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +/* + * This value defines the number of allowed allocation failures per vdev. + * If a device reaches this threshold in a given txg then we consider skipping + * allocations on that device. + */ +int zfs_mg_alloc_failures = 0; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN, + &zfs_mg_alloc_failures, 0, + "Number of allowed allocation failures per vdev"); +TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures); + /* * Metaslab debugging: when set, keeps all space maps in core to verify frees. */ @@ -671,7 +697,7 @@ static space_map_ops_t metaslab_ndf_ops = { metaslab_ndf_fragmented }; -space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; /* * ========================================================================== @@ -844,7 +870,7 @@ metaslab_prefetch(metaslab_group_t *mg) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) { metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; @@ -877,13 +903,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) mutex_exit(&mg->mg_lock); } - /* - * If we were able to load the map then make sure - * that this map is still able to satisfy our request. - */ - if (msp->ms_weight < size) - return (ENOSPC); - metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -1099,6 +1118,7 @@ void metaslab_sync_reassess(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; + int64_t failures = mg->mg_alloc_failures; /* * Re-evaluate all metaslabs which have lower offsets than the @@ -1115,6 +1135,8 @@ metaslab_sync_reassess(metaslab_group_t *mg) mutex_exit(&msp->ms_lock); } + atomic_add_64(&mg->mg_alloc_failures, -failures); + /* * Prefetch the next potential metaslabs */ @@ -1139,9 +1161,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva) } static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) +metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, + uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) { + spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp = NULL; uint64_t offset = -1ULL; avl_tree_t *t = &mg->mg_metaslab_tree; @@ -1162,11 +1185,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { + if (msp->ms_weight < asize) { + spa_dbgmsg(spa, "%s: failed to meet weight " + "requirement: vdev %llu, txg %llu, mg %p, " + "msp %p, psize %llu, asize %llu, " + "failures %llu, weight %llu", + spa_name(spa), mg->mg_vd->vdev_id, txg, + mg, msp, psize, asize, + mg->mg_alloc_failures, msp->ms_weight); mutex_exit(&mg->mg_lock); return (-1ULL); } - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -1185,6 +1214,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp == NULL) return (-1ULL); + /* + * If we've already reached the allowable number of failed + * allocation attempts on this metaslab group then we + * consider skipping it. We skip it only if we're allowed + * to "fast" gang, the physical size is larger than + * a gang block, and we're attempting to allocate from + * the primary metaslab. + */ + if (mg->mg_alloc_failures > zfs_mg_alloc_failures && + CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && + activation_weight == METASLAB_WEIGHT_PRIMARY) { + spa_dbgmsg(spa, "%s: skipping metaslab group: " + "vdev %llu, txg %llu, mg %p, psize %llu, " + "asize %llu, failures %llu", spa_name(spa), + mg->mg_vd->vdev_id, txg, mg, psize, asize, + mg->mg_alloc_failures); + return (-1ULL); + } + mutex_enter(&msp->ms_lock); /* @@ -1193,7 +1241,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. */ - if (msp->ms_weight < size || (was_active && + if (msp->ms_weight < asize || (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK) && activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); @@ -1208,14 +1256,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight, size) != 0) { + if (metaslab_activate(msp, activation_weight) != 0) { mutex_exit(&msp->ms_lock); continue; } - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) break; + atomic_inc_64(&mg->mg_alloc_failures); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); @@ -1224,7 +1274,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); mutex_exit(&msp->ms_lock); @@ -1351,7 +1401,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + offset = metaslab_group_alloc(mg, psize, asize, txg, distance, + dva, d, flags); if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -1363,18 +1414,24 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* - * Bias by at most +/- 25% of the aliquot. + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. */ mg->mg_bias = ((cu - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); + (int64_t)mg->mg_aliquot) / 100; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= @@ -1488,7 +1545,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) error = ENOENT; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 1709f6884c9d..91383091c634 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -1674,3 +1675,9 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) return (0); } + +boolean_t +spa_debug_enabled(spa_t *spa) +{ + return (spa->spa_debug); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 583d6303bd5a..2cf4d2b489bd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 #define METASLAB_GANG_HEADER 0x2 +#define METASLAB_GANG_CHILD 0x4 +#define METASLAB_GANG_AVOID 0x8 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 07988dd51a73..6c670a1624ab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_IMPL_H @@ -52,6 +53,7 @@ struct metaslab_group { avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; uint64_t mg_bonus_area; + uint64_t mg_alloc_failures; int64_t mg_bias; int64_t mg_activation_count; metaslab_class_t *mg_class; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index 9cdec18bea76..fe6e72755618 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_SPA_H @@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif +extern boolean_t spa_debug_enabled(spa_t *spa); +#define spa_dbgmsg(spa, ...) \ +{ \ + if (spa_debug_enabled(spa)) \ + zfs_dbgmsg(__VA_ARGS__); \ +} + extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index a2f15d2863fa..2dbe552c724b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #ifndef _SYS_SPA_IMPL_H @@ -196,6 +197,7 @@ struct spa { kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ + boolean_t spa_debug; /* debug enabled? */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 9a043449c8ad..5f8f00b6a577 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. */ #include @@ -85,6 +86,7 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif +extern int zfs_mg_alloc_failures; /* * An allocating zio is one that either currently has the DVA allocate @@ -160,6 +162,15 @@ zio_init(void) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } + /* + * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs + * to fail 3 times per txg or 8 failures, whichever is greater. + */ + if (zfs_mg_alloc_failures == 0) + zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); + else if (zfs_mg_alloc_failures < 8) + zfs_mg_alloc_failures = 8; + zio_inject_init(); } @@ -2135,6 +2146,7 @@ zio_dva_allocate(zio_t *zio) metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; + int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2147,10 +2159,21 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + /* + * The dump device does not support gang blocks so allocation on + * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid + * the "fast" gang feature. + */ + flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; + flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? + METASLAB_GANG_CHILD : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_copies, zio->io_txg, NULL, 0); + zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { + spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " + "size %llu, error %d", spa_name(spa), zio, zio->io_size, + error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error;