diff --git a/include/sys/spa.h b/include/sys/spa.h index 18062d3f2a95..dc968bfb4f66 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -825,6 +825,11 @@ extern void spa_sync_allpools(void); extern uint_t zfs_sync_pass_deferred_free; +/* spa sync taskqueues */ +taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); +void spa_sync_tq_destroy(spa_t *spa); +void spa_select_allocator(zio_t *zio); + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 1a04bedc3137..4f914b89e2d1 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -188,6 +188,12 @@ typedef struct spa_taskqs { taskq_t **stqs_taskq; } spa_taskqs_t; +/* one for each thread in the spa sync taskq */ +typedef struct spa_syncthread_info { + kthread_t *sti_thread; + taskq_t *sti_wr_iss_tq; /* assigned wr_iss taskq */ +} spa_syncthread_info_t; + typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ @@ -265,6 +271,10 @@ struct spa { int spa_alloc_count; int spa_active_allocator; /* selectable allocator */ + /* per-allocator sync thread taskqs */ + taskq_t *spa_sync_tq; + spa_syncthread_info_t *spa_syncthreads; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ @@ -456,7 +466,7 @@ extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); diff --git a/include/sys/zio.h b/include/sys/zio.h index e1f4d5c04499..25a4b221f05e 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -223,6 +223,9 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_ALLOCATOR_NONE (-1) +#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) + #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) @@ -526,6 +529,9 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; + + /* write issue taskq selection, based upon sync thread */ + taskq_t *io_wr_iss_tq; }; enum blk_verify_flag { diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 3843419731b8..0a3cbc9f0b06 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -490,6 +490,14 @@ If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . +.It Sy spa_num_allocators Ns = Ns Sy 4 Pq int +Determines the number of block allocators to use per spa instance. +Capped by the number of actual CPUs in the system. +.Pp +The number of CPUs in the system should ideally be a multiple of this value. +Note that setting this value too high could result in performance +degradation and/or excess fragmentation. +. .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the @@ -1971,13 +1979,6 @@ and may need to load new metaslabs to satisfy these allocations. .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. .
-.It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int -This controls the number of threads used by -.Sy dp_sync_taskq . -The default value of -.Sy 75% -will create a maximum of one thread per CPU. -. .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of TRIM command. Larger ranges will be split into chunks no larger than this value before @@ -2264,6 +2265,14 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_wr_iss_ncpus Ns = Ns Sy 32 Pq uint +Determines the number of CPUs to run write issue taskqs. +.Pp +While an optimal value will be system dependent, a suggested value +is the number of actual CPUs in the system, divided by the +.Sy spa_num_allocators +value. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 6321895c73e8..0a179fffb16a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4587,7 +4587,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } } -/* May be called recursively from dbuf_sync_indirect(). */ +/* + * Syncs out a range of dirty records for indirect or leaf dbufs. May be + * called recursively from dbuf_sync_indirect(). + */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 2fe217475132..20a334a00529 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1649,6 +1649,7 @@ typedef struct sync_objset_arg { objset_t *soa_os; dmu_tx_t *soa_tx; zio_t *soa_zio; + taskq_ent_t soa_tq_ent; } sync_objset_arg_t; static void @@ -1688,8 +1689,8 @@ dmu_objset_sync_sublists_done(zio_t *zio) } /* sync_dnodes_finsh_task calls zil_sync on our behalf. */ - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_finish_task, soa, TQ_FRONT); + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_finish_task, soa, TQ_FRONT, &soa->soa_tq_ent); } /* Nonblocking objset sync. Called from dsl. */ @@ -1795,6 +1796,7 @@ dmu_objset_sync(objset_t *os, zio_t *rio, dmu_tx_t *tx) soa->soa_os = os; soa->soa_tx = tx; soa->soa_zio = zio; + taskq_init_ent(&soa->soa_tq_ent); /* sio is a child of the arc_write zio and parent of the sda_cio(s). 
*/ zio_t *sio = zio_null(zio, os->os_spa, NULL, diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 1bf6a06f3952..3ec374b1d8a6 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -209,8 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - MIN(spa->spa_alloc_count, boot_ncpus), minclsyspri, 1, INT_MAX, 0); + dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -403,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); + spa_sync_tq_destroy(dp->dp_spa); /* * We can't set retry to TRUE since we're explicitly specifying diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cda62f939c1e..0c64d141840c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -99,6 +99,7 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +#include /* * The interval, in seconds, at which failed configuration cache file writes @@ -108,16 +109,16 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ + ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -138,14 +139,14 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH + * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, - * but with number of taskqs also scaling with number of CPUs. + * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs + * that scales with the number of CPUs. 
* * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that @@ -155,7 +156,7 @@ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ @@ -174,6 +175,8 @@ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ +static uint_t zio_taskq_wr_iss_ncpus = 32; + static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ /* @@ -1024,17 +1027,28 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; - boolean_t batch = B_FALSE; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; - case ZTI_MODE_BATCH: - batch = B_TRUE; + case ZTI_MODE_SYNC: + /* + * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', + * not to exceed the number of spa allocators. + */ + count = MAX(1, boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + count = MIN(count, spa->spa_alloc_count); + + /* + * zio_taskq_batch_pct is unbounded and may exceed 100%, but no + * single taskq may have more threads than 100% of online cpus. + */ + value = (zio_taskq_batch_pct + count / 2) / count; + value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; - value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: @@ -1081,7 +1095,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", + "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } @@ -1102,9 +1116,6 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); @@ -1162,12 +1173,11 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * on the taskq itself. 
*/ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +static taskq_t * +spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + zio_t *zio) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; @@ -1175,12 +1185,27 @@ spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { + /* dispatch to assigned write issue taskq */ + tq = zio->io_wr_iss_tq; + return (tq); + } + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } + return (tq); +} +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, + zio_t *zio) +{ + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); taskq_dispatch_ent(tq, func, arg, flags, ent); } @@ -1191,20 +1216,8 @@ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; - taskqid_t id; - - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); - - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; - } - - id = taskq_dispatch(tq, func, arg, flags); + taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); + taskqid_t id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } @@ -9591,6 +9604,154 @@ spa_sync_allpools(void) mutex_exit(&spa_namespace_lock); } +static void spa_sync_tq_assign(void *arg); + +typedef struct sync_tq_arg { + kthread_t *sta_thread; + kcondvar_t sta_cv; + kmutex_t sta_lock; + int sta_ready; +} sync_tq_arg_t; + +/* unfortunately, taskqueues do not provide per-thread private data */ +static void +spa_sync_tq_assign(void *arg) +{ + sync_tq_arg_t *sta = arg; + + mutex_enter(&sta->sta_lock); + sta->sta_thread = curthread; + sta->sta_ready = 1; + cv_signal(&sta->sta_cv); + while (sta->sta_ready == 1) + cv_wait(&sta->sta_cv, &sta->sta_lock); + mutex_exit(&sta->sta_lock); +} + +taskq_t * +spa_sync_tq_create(spa_t *spa, const char *name) +{ + ASSERT(spa->spa_sync_tq == NULL); + ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); + + /* + * - do not allow more allocators than cpus. + * - there may be more cpus than allocators. + * - do not allow more sync taskq threads than allocators or cpus. 
+ */ + int nthreads = spa->spa_alloc_count; + + sync_tq_arg_t *stq = kmem_zalloc(sizeof (*stq) * nthreads, KM_SLEEP); + spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * + nthreads, KM_SLEEP); + + spa->spa_sync_tq = taskq_create(name, nthreads, minclsyspri, nthreads, + INT_MAX, TASKQ_PREPOPULATE); + VERIFY(spa->spa_sync_tq != NULL); + + /* spawn all syncthreads */ + for (int i = 0; i < nthreads; i++) { + cv_init(&stq[i].sta_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&stq[i].sta_lock, NULL, MUTEX_DEFAULT, NULL); + (void) taskq_dispatch(spa->spa_sync_tq, spa_sync_tq_assign, + &stq[i], TQ_FRONT); + } + + /* wait on all syncthreads to start */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&stq[i].sta_lock); + while (stq[i].sta_ready == 0) + cv_wait(&stq[i].sta_cv, &stq[i].sta_lock); + mutex_exit(&stq[i].sta_lock); + } + + /* let all syncthreads resume, finish */ + for (int i = 0; i < nthreads; i++) { + mutex_enter(&stq[i].sta_lock); + stq[i].sta_ready = 2; + cv_broadcast(&stq[i].sta_cv); + mutex_exit(&stq[i].sta_lock); + } + taskq_wait(spa->spa_sync_tq); + + spa_taskqs_t *tqs = + &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { + ti->sti_thread = stq[i].sta_thread; + if (w == tqs->stqs_count) { + w = 0; + } + ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; + mutex_destroy(&stq[i].sta_lock); + cv_destroy(&stq[i].sta_cv); + } + kmem_free(stq, sizeof (*stq) * nthreads); + + return (spa->spa_sync_tq); +} + +void +spa_sync_tq_destroy(spa_t *spa) +{ + ASSERT(spa->spa_sync_tq != NULL); + + taskq_wait(spa->spa_sync_tq); + taskq_destroy(spa->spa_sync_tq); + kmem_free(spa->spa_syncthreads, + sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); + spa->spa_sync_tq = NULL; +} + +void +spa_select_allocator(zio_t *zio) +{ + zbookmark_phys_t *bm = &zio->io_bookmark; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * A gang block (for example) may have inherited its parent's + * allocator, in which case there is nothing further to do here. + */ + if (ZIO_HAS_ALLOCATOR(zio)) + return; + + ASSERT(spa != NULL); + ASSERT(bm != NULL); + + /* + * First try to use an allocator assigned to the syncthread, and set + * the corresponding write issue taskq for the allocator. + * Note, we must have an open pool to do this. + */ + if (spa->spa_sync_tq != NULL) { + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + zio->io_allocator = i; + zio->io_wr_iss_tq = ti->sti_wr_iss_tq; + return; + } + } + } + + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. 
+ */ + uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, + bm->zb_blkid >> 20); + + zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; + zio->io_wr_iss_tq = NULL; +} + /* * ========================================================================== * Miscellaneous routines @@ -10181,3 +10342,6 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, + "Number of CPUs to run write issue taskqs"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index c7472f972cc2..b8697092a555 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -388,7 +388,11 @@ uint_t spa_asize_inflation = 24; uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; -static const int spa_allocators = 4; + +/* + * Number of allocators to use, per spa instance + */ +static int spa_num_allocators = 4; /* * Spa active allocator. @@ -730,7 +734,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - spa->spa_alloc_count = spa_allocators; + /* Do not allow more allocators than CPUs. */ + spa->spa_alloc_count = MIN(spa_num_allocators, boot_ncpus); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { @@ -739,6 +745,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, @@ -3009,3 +3016,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); + +ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, + "Number of allocators per spa, capped by ncpus"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73d8..48ae1a9b4923 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -900,6 +900,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; + zio->io_allocator = ZIO_ALLOCATOR_NONE; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); @@ -1991,7 +1992,7 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent); + &zio->io_tqent, zio); } static boolean_t @@ -2016,8 +2017,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) static zio_t * zio_issue_async(zio_t *zio) { + ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); } @@ -2331,6 +2332,9 @@ zio_wait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } 
__zio_execute(zio); mutex_enter(&zio->io_lock); @@ -2383,6 +2387,9 @@ zio_nowait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); } @@ -2841,6 +2848,13 @@ zio_gang_issue(zio_t *zio) return (zio); } +static void +zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) +{ + cio->io_allocator = pio->io_allocator; + cio->io_wr_iss_tq = pio->io_wr_iss_tq; +} + static void zio_write_gang_member_ready(zio_t *zio) { @@ -2912,6 +2926,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) gbh_copies = MIN(2, spa_max_replication(spa)); } + ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2975,6 +2990,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(pio, zio); + /* * Create and nowait the gang children. */ @@ -3005,6 +3022,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(zio, cio); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); @@ -3517,6 +3536,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to @@ -3553,21 +3573,12 @@ zio_dva_throttle(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. - */ - int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - zio->io_allocator = allocator; + int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); @@ -3641,6 +3652,7 @@ zio_dva_allocate(zio_t *zio) * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. 
*/ + ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -4493,6 +4505,7 @@ zio_ready(zio_t *zio) ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and @@ -4579,6 +4592,7 @@ zio_dva_throttle_done(zio_t *zio) } ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); @@ -4641,6 +4655,7 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); @@ -4906,7 +4921,7 @@ zio_done(zio_t *zio) ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent); + zio_reexecute, zio, 0, &zio->io_tqent, NULL); } return (NULL); }
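
The sketch below is not part of the patch and is not OpenZFS source; it restates the allocator-selection policy that spa_select_allocator() introduces, in a self-contained form. A write issued from one of the pool's sync threads inherits that thread's allocator and its assigned write-issue taskq, while any other write hashes its bookmark (objset, object, level, and 2^20-block region) onto an allocator. The struct names and the mix4() hash are illustrative stand-ins for spa_t/zio_t and cityhash4(), chosen so the example compiles on its own with POSIX threads.

#include <stdint.h>
#include <pthread.h>

#define	ALLOCATOR_NONE	(-1)

struct syncthread_info {
	pthread_t	sti_thread;	/* sync thread bound to allocator i */
	void		*sti_wr_iss_tq;	/* write-issue taskq assigned to it */
};

struct pool {
	int			p_alloc_count;	/* capped at CPU count */
	struct syncthread_info	*p_syncthreads;	/* p_alloc_count entries */
};

struct write_io {
	uint64_t	io_objset;
	uint64_t	io_object;
	int64_t		io_level;
	uint64_t	io_blkid;
	int		io_allocator;	/* ALLOCATOR_NONE until selected */
	void		*io_wr_iss_tq;	/* NULL => pick a taskq at dispatch */
};

/* Stand-in for cityhash4(): any reasonable 4-word mixer works here. */
static uint64_t
mix4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t h = 0x9e3779b97f4a7c15ULL ^ a;
	uint64_t v[3] = { b, c, d };

	for (int i = 0; i < 3; i++) {
		h ^= v[i];
		h *= 0xff51afd7ed558ccdULL;
		h ^= h >> 33;
	}
	return (h);
}

void
select_allocator(struct pool *p, struct write_io *io)
{
	/* A gang child, for example, already inherited its parent's choice. */
	if (io->io_allocator != ALLOCATOR_NONE)
		return;

	/* Prefer the allocator owned by the calling sync thread. */
	for (int i = 0; i < p->p_alloc_count; i++) {
		if (pthread_equal(p->p_syncthreads[i].sti_thread,
		    pthread_self())) {
			io->io_allocator = i;
			io->io_wr_iss_tq = p->p_syncthreads[i].sti_wr_iss_tq;
			return;
		}
	}

	/*
	 * Otherwise hash the bookmark; shifting io_blkid by 20 keeps
	 * logically adjacent 2^20-block regions of an object on the same
	 * allocator, which helps later sequential reads.
	 */
	uint64_t h = mix4(io->io_objset, io->io_object,
	    (uint64_t)io->io_level, io->io_blkid >> 20);

	io->io_allocator = (int)(h % (uint64_t)p->p_alloc_count);
	io->io_wr_iss_tq = NULL;
}

In the patch itself, the sync-thread table this loop walks is built by spa_sync_tq_create(), and the hash fallback is the logic formerly in zio_dva_throttle(), moved into spa_select_allocator().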