From e96772321ab32595d921c1328bcce045dad4bb3c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 29 Sep 2023 12:28:55 +1000 Subject: [PATCH 1/6] spa_stats: kstats for unflushed log spacemaps Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/spa.h | 4 ++- module/zfs/spa.c | 5 ++- module/zfs/spa_stats.c | 82 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 2 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index ca30b60c0af7..1d3e488a2289 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -28,7 +28,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Datto Inc. */ @@ -911,6 +911,7 @@ typedef struct spa_stats { spa_history_kstat_t state; /* pool state */ spa_history_kstat_t guid; /* pool guid */ spa_history_kstat_t iostats; + spa_history_kstat_t log_spacemaps; } spa_stats_t; typedef enum txg_state { @@ -1029,6 +1030,7 @@ typedef enum spa_log_state { extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); +extern void spa_log_sm_stats_update(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 6b8c7ee93daa..c84dd3bac16f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -34,7 +34,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. - * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2023, 2024, Klara, Inc. 
*/ /* @@ -2012,6 +2012,8 @@ spa_unload_log_sm_metadata(spa_t *spa) spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; + + spa_log_sm_stats_update(spa); } static void @@ -10255,6 +10257,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); + spa_log_sm_stats_update(spa); if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) vdev_autotrim_kick(spa); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 45a2f06263a0..210108a2cb05 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -19,6 +19,10 @@ * CDDL HEADER END */ +/* + * Copyright (c) 2024, Klara, Inc. + */ + #include #include #include @@ -1034,6 +1038,82 @@ spa_iostats_destroy(spa_t *spa) mutex_destroy(&shk->lock); } +/* + * Log spacemap stats. + */ +typedef struct spa_log_sm_stats { + kstat_named_t unflushed_memused; + kstat_named_t unflushed_blocklimit; + kstat_named_t unflushed_nblocks; +} spa_log_sm_stats_t; + +static spa_log_sm_stats_t spa_log_sm_stats_template = { + { "unflushed_memused", KSTAT_DATA_UINT64 }, + { "unflushed_blocklimit", KSTAT_DATA_UINT64 }, + { "unflushed_nblocks", KSTAT_DATA_UINT64 } +}; + +#define SPA_LOG_SM_STATS_SET(stat, val) \ + atomic_store_64(&log_sm_stats->stat.value.ui64, (val)); + +void +spa_log_sm_stats_update(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.log_spacemaps; + kstat_t *ksp = shk->kstat; + + if (ksp == NULL) + return; + + spa_log_sm_stats_t *log_sm_stats = ksp->ks_data; + + SPA_LOG_SM_STATS_SET(unflushed_memused, + spa->spa_unflushed_stats.sus_memused); + SPA_LOG_SM_STATS_SET(unflushed_blocklimit, + spa->spa_unflushed_stats.sus_blocklimit); + SPA_LOG_SM_STATS_SET(unflushed_nblocks, + spa->spa_unflushed_stats.sus_nblocks); +} + +static void +spa_log_sm_stats_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.log_spacemaps; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + char *name = 
kmem_asprintf("zfs/%s", spa_name(spa)); + kstat_t *ksp = kstat_create(name, 0, "log_spacemaps", "misc", + KSTAT_TYPE_NAMED, + sizeof (spa_log_sm_stats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = + kmem_alloc(sizeof (spa_log_sm_stats_t), KM_SLEEP); + memcpy(ksp->ks_data, &spa_log_sm_stats_template, + sizeof (spa_log_sm_stats_t)); + kstat_install(ksp); + } + + kmem_strfree(name); +} + +static void +spa_log_sm_stats_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.log_spacemaps; + kstat_t *ksp = shk->kstat; + if (ksp) { + kmem_free(ksp->ks_data, sizeof (spa_log_sm_stats_t)); + kstat_delete(ksp); + } + + mutex_destroy(&shk->lock); +} + void spa_stats_init(spa_t *spa) { @@ -1044,11 +1124,13 @@ spa_stats_init(spa_t *spa) spa_state_init(spa); spa_guid_init(spa); spa_iostats_init(spa); + spa_log_sm_stats_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_log_sm_stats_destroy(spa); spa_iostats_destroy(spa); spa_health_destroy(spa); spa_tx_assign_destroy(spa); From aa2bcb0305a0c2e7e1323309e80f9d4882f699ba Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 29 Sep 2023 14:53:27 +1000 Subject: [PATCH 2/6] log_spacemap: extend pool flushall to have "request" and "export" modes Normally, log spacemaps are flushed out to the metaslabs when the pool is exported. For large logs, this can lead to export taking an inordinate amount of time. This commit adds a "mode" parameter for the log spacemap "flushall" operation, and functions for starting and stopping it in a particular mode. The existing behaviour of flushing everything is now the "export" mode. Then, we add a new "request" mode, that can be triggered externally. This mode differs in that it only flushes spacemaps that were dirtied before the current transaction, stopping when the only dirty ones remaining, if any, are newer. 
This commit only adds the behaviours and sets up the entry points; the next commit will add something to call them. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/spa_impl.h | 2 + include/sys/spa_log_spacemap.h | 12 ++- man/man4/zfs.4 | 6 +- module/zfs/spa.c | 6 +- module/zfs/spa_log_spacemap.c | 152 ++++++++++++++++++++++++++++++--- 5 files changed, 164 insertions(+), 14 deletions(-) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 7811abbb9ce3..efe18d5410ff 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -27,6 +27,7 @@ * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _SYS_SPA_IMPL_H @@ -358,6 +359,7 @@ struct spa { avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; + spa_log_flushall_mode_t spa_log_flushall_mode; uint64_t spa_log_flushall_txg; zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ diff --git a/include/sys/spa_log_spacemap.h b/include/sys/spa_log_spacemap.h index f59e69917833..e9a0826563dc 100644 --- a/include/sys/spa_log_spacemap.h +++ b/include/sys/spa_log_spacemap.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. 
*/ #ifndef _SYS_SPA_LOG_SPACEMAP_H @@ -56,6 +57,12 @@ typedef struct spa_log_sm { space_map_t *sls_sm; /* space map pointer, if open */ } spa_log_sm_t; +typedef enum spa_log_flushall_mode { + SPA_LOG_FLUSHALL_NONE = 0, /* flushall inactive */ + SPA_LOG_FLUSHALL_REQUEST, /* flushall active by admin request */ + SPA_LOG_FLUSHALL_EXPORT, /* flushall active for pool export */ +} spa_log_flushall_mode_t; + int spa_ld_log_spacemaps(spa_t *); void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *); @@ -77,7 +84,10 @@ void spa_log_summary_dirty_flushed_metaslab(spa_t *, uint64_t); void spa_log_summary_decrement_mscount(spa_t *, uint64_t, boolean_t); void spa_log_summary_decrement_blkcount(spa_t *, uint64_t); -boolean_t spa_flush_all_logs_requested(spa_t *); +void spa_log_flushall_start(spa_t *spa, spa_log_flushall_mode_t mode, + uint64_t txg); +void spa_log_flushall_done(spa_t *spa); +void spa_log_flushall_cancel(spa_t *spa); extern int zfs_keep_log_spacemaps_at_export; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index da027798f962..828a65d6edc8 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -18,7 +18,7 @@ .\" .\" Copyright (c) 2024, Klara, Inc. .\" -.Dd November 1, 2024 +.Dd November 11, 2024 .Dt ZFS 4 .Os . @@ -1778,6 +1778,10 @@ Normally disabled because these datasets may be missing key data. .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . +.It Sy zfs_min_metaslabs_to_flush_all Ns = Ns Sy 5 Pq u64 +Minimum number of metaslabs to flush per dirty TXG when condensing log +spacemaps. +. .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. 
diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c84dd3bac16f..4ffac89f70a2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1984,8 +1984,8 @@ spa_unload_log_sm_flush_all(spa_t *spa) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - ASSERT3U(spa->spa_log_flushall_txg, ==, 0); - spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); + spa_log_flushall_start(spa, SPA_LOG_FLUSHALL_EXPORT, + dmu_tx_get_txg(tx)); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); @@ -2070,6 +2070,8 @@ spa_unload(spa_t *spa) */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); + else + spa_log_flushall_done(spa); /* * Stop async tasks. diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index f55218e3579b..a1654bec9036 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. */ #include @@ -284,6 +285,12 @@ static uint64_t zfs_max_logsm_summary_length = 10; */ static uint64_t zfs_min_metaslabs_to_flush = 1; +/* + * Tunable that sets the minimum metaslabs to flush every TXG when the user + * has requested flushall (via 'zpool condense'). + */ +static uint64_t zfs_min_metaslabs_to_flush_all = 5; + /* * Tunable that specifies how far in the past do we want to look when trying to * estimate the incoming log blocks for the current TXG. @@ -676,7 +683,9 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) uint64_t total_flushes = 0; /* Holds the current maximum of our estimates so far. */ - uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush; + uint64_t max_flushes_pertxg = + spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST ? 
+ zfs_min_metaslabs_to_flush_all : zfs_min_metaslabs_to_flush; /* * For our estimations we only look as far in the future @@ -746,10 +755,83 @@ spa_log_exceeds_memlimit(spa_t *spa) return (B_FALSE); } -boolean_t -spa_flush_all_logs_requested(spa_t *spa) +void +spa_log_flushall_start(spa_t *spa, spa_log_flushall_mode_t mode, uint64_t txg) { - return (spa->spa_log_flushall_txg != 0); + /* Shouldn't happen, but it's not dangerous if it does. */ + ASSERT3U(mode, !=, SPA_LOG_FLUSHALL_NONE); + if (mode == SPA_LOG_FLUSHALL_NONE) + return; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + if (txg == 0) + txg = spa_last_synced_txg(spa); + + if (spa->spa_log_flushall_mode != SPA_LOG_FLUSHALL_EXPORT) { + /* + * We can set _REQUEST even if it's already in _REQUEST; this + * has the effect of just pushing out the end txg. + */ + spa->spa_log_flushall_mode = mode; + spa->spa_log_flushall_txg = txg; + } + + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST) { + /* Reset stats */ + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + memset(pcns, 0, sizeof (pool_condense_stat_t)); + pcns->pcns_start_time = gethrestime_sec(); + pcns->pcns_total = spa_log_sm_nblocks(spa); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void +spa_log_flushall_done(spa_t *spa) +{ + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_NONE) + return; + + IMPLY(spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST, + spa_state(spa) == POOL_STATE_ACTIVE); + IMPLY(spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_EXPORT, + spa_state(spa) == POOL_STATE_EXPORTED); + ASSERT(spa->spa_log_flushall_txg); + + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST) { + /* + * Finish stats. Note that the flush is by txgs, not blocks, so + * we set the processed to the total just so everything looks + * right for the user even if they're not exactly the same. 
+ */ + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + pcns->pcns_end_time = gethrestime_sec(); + pcns->pcns_processed = pcns->pcns_total; + } + + spa->spa_log_flushall_mode = SPA_LOG_FLUSHALL_NONE; + spa->spa_log_flushall_txg = 0; +} + +void +spa_log_flushall_cancel(spa_t *spa) +{ + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_NONE) + return; + + ASSERT(spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST); + + spa->spa_log_flushall_mode = SPA_LOG_FLUSHALL_NONE; + spa->spa_log_flushall_txg = 0; + + /* Finish stats. */ + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + pcns->pcns_end_time = gethrestime_sec(); } void @@ -785,7 +867,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) */ if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && - !spa_flush_all_logs_requested(spa)) + spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_NONE) return; /* @@ -809,16 +891,25 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * metaslabs and attempt to destroy old log space maps. */ uint64_t want_to_flush; - if (spa_flush_all_logs_requested(spa)) { + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_EXPORT) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } - /* Used purely for verification purposes */ + /* + * Count of metaslabs we checked this round. Used to know we've + * finished a user-requested flushall, and for verification. + */ uint64_t visited = 0; + /* + * Unflushed blocks at start of loop, so we can report on how many we + * flushed. + */ + uint64_t start_nblocks = spa_log_sm_nblocks(spa); + /* * Ideally we would only iterate through spa_metaslabs_by_flushed * using only one variable (curr). 
We can't do that because @@ -836,8 +927,30 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * If this metaslab has been flushed this txg then we've done * a full circle over the metaslabs. */ - if (metaslab_unflushed_txg(curr) == txg) + uint64_t unflushed_txg = metaslab_unflushed_txg(curr); + if (unflushed_txg == txg) { + spa_log_flushall_done(spa); + break; + } + + /* + * If the admin requested flush, skip metaslabs that were + * modified after the flush request. + */ + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST && + unflushed_txg > spa->spa_log_flushall_txg) { + visited++; + if (visited < + avl_numnodes(&spa->spa_metaslabs_by_flushed)) + continue; + + /* + * We visited all metaslabs and they're all dirty after + * the admin requested flush, so all flushing is done. + */ + spa_log_flushall_done(spa); break; + } /* * If we are done flushing for the block heuristic and the @@ -862,6 +975,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); spa_log_sm_set_blocklimit(spa); + + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_REQUEST) { + /* + * If the admin requested a flush, then we're only processing + * blocks created before the flush request. The total number of + * unflushed blocks can still go up, but not since we set + * start_nblocks before the metaslab loop above. Therefore, + * there can never be more blocks than there were at the start. + */ + uint64_t end_nblocks = spa_log_sm_nblocks(spa); + ASSERT3U(start_nblocks, >=, end_nblocks); + + pool_condense_stat_t *pcns = + &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; + pcns->pcns_processed += start_nblocks - end_nblocks; + } } /* @@ -901,9 +1030,9 @@ spa_sync_close_syncing_log_sm(spa_t *spa) * so the last few TXGs before closing the pool can be empty * (e.g. not dirty). 
*/ - if (spa_flush_all_logs_requested(spa)) { + if (spa->spa_log_flushall_mode == SPA_LOG_FLUSHALL_EXPORT) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); - spa->spa_log_flushall_txg = 0; + spa_log_flushall_done(spa); } } @@ -1396,6 +1525,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW, "The number of past TXGs that the flushing algorithm of the log " "spacemap feature uses to estimate incoming log blocks"); +ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush_all, ULONG, ZMOD_RW, + "Minimum number of metaslabs to flush per TXG when condensing"); + ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, "Prevent the log spacemaps from being flushed and destroyed " "during pool export/destroy"); From 8205d5a66b5a4f42b02ee7c4d0d0ebeeaba35090 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 29 Sep 2023 17:03:40 +1000 Subject: [PATCH 3/6] zpool: add condense verb and wire up to log-spacemap flush The idea is to have a single command that could signal to any background cleanup task that it should do its work faster, or care less about not getting in the way of user IO, or whatever. This adds the `zpool condense` command, the `ZFS_IOC_POOL_CONDENSE` ioctl and counters so userspace can get progress. Because the target could be anything, there's no particular unit, just a total number of items to condense and count of how many done. It also adds a `log-spacemap` condense target, which calls the "request log flush" function. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- cmd/zpool/zpool_main.c | 217 +++++++++++++++++++++++++++++++- include/libzfs.h | 3 + include/libzfs_core.h | 4 + include/sys/fs/zfs.h | 36 ++++++ include/sys/spa_impl.h | 4 + lib/libzfs/libzfs.abi | 27 ++++ lib/libzfs/libzfs_pool.c | 19 ++- lib/libzfs_core/libzfs_core.abi | 21 ++++ lib/libzfs_core/libzfs_core.c | 18 +++ man/Makefile.am | 1 + man/man8/zpool-condense.8 | 62 +++++++++ man/man8/zpool-status.8 | 4 +- man/man8/zpool-wait.8 | 6 +- man/man8/zpool.8 | 5 +- module/zfs/vdev_label.c | 9 ++ module/zfs/zfs_ioctl.c | 70 ++++++++++- 16 files changed, 500 insertions(+), 6 deletions(-) create mode 100644 man/man8/zpool-condense.8 diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 4458b902de31..35cc24f1ee45 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -32,7 +32,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, 2023, Klara Inc. + * Copyright (c) 2021, 2023, 2024, Klara, Inc. 
* Copyright [2021] Hewlett Packard Enterprise Development LP */ @@ -126,6 +126,7 @@ static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); +static int zpool_do_condense(int, char **); static int zpool_do_version(int, char **); @@ -173,6 +174,7 @@ typedef enum { HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, + HELP_CONDENSE, HELP_DDT_PRUNE, HELP_DESTROY, HELP_DETACH, @@ -360,6 +362,16 @@ static const char *vdev_trim_state_str[] = { "COMPLETE" }; +static const char *condense_type_str[POOL_CONDENSE_TYPES] = { + "log spacemap", +}; +static const char *condense_type_nv_str[POOL_CONDENSE_TYPES] = { + "log_spacemap", +}; +static const char *condense_type_unit_str[POOL_CONDENSE_TYPES] = { + "blocks", +}; + #define ZFS_NICE_TIMESTAMP 100 /* @@ -416,6 +428,7 @@ static zpool_command_t command_table[] = { { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, + { "condense", zpool_do_condense, HELP_CONDENSE }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, @@ -427,6 +440,7 @@ static zpool_command_t command_table[] = { { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, + { NULL }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, @@ -546,6 +560,8 @@ get_usage(zpool_help_t idx) return (gettext("\treguid [-g guid] \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); + case HELP_CONDENSE: + return (gettext("\tcondense -t [-c] \n")); case HELP_VERSION: return (gettext("\tversion [-j]\n")); case HELP_WAIT: @@ -8688,6 +8704,105 @@ zpool_do_trim(int argc, char **argv) return (error); } +typedef struct { + pool_condense_func_t func; + pool_condense_type_t type; +} condense_cb_t; + +static int +condense_cb(zpool_handle_t *zhp, void *data) +{ + condense_cb_t *cb = data; + return (zpool_condense(zhp, cb->func, cb->type)); +} + +/* + * 
zpool condense -t [-c] + * + * -t What to condense. + * -c Cancel. Ends any in-progress condense. + * + * Condense (flush) the log spacemap on the specified pool(s). + */ +static int +zpool_do_condense(int argc, char **argv) +{ + struct option long_options[] = { + {"target", required_argument, NULL, 't'}, + {"cancel", no_argument, NULL, 'c'}, + {0, 0, 0, 0} + }; + + struct target_map { + const char *name; + pool_condense_type_t type; + } targets[] = { + {"log-spacemap", POOL_CONDENSE_LOG_SPACEMAP}, + {0, 0} + }; + + condense_cb_t cb = { + .func = POOL_CONDENSE_START, + .type = POOL_CONDENSE_TYPES, + }; + + int c; + while ((c = getopt_long(argc, argv, "t:cw", long_options, NULL)) + != -1) { + switch (c) { + case 't': { + struct target_map *t; + for (t = targets; t->name != NULL; t++) { + if (strcmp(t->name, optarg) == 0) { + cb.type = t->type; + break; + } + } + if (t->name == NULL) { + (void) fprintf(stderr, + gettext("invalid condense target '%s'\n"), + optarg); + usage(B_FALSE); + } + break; + } + case 'c': + cb.func = POOL_CONDENSE_CANCEL; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + if (cb.type == POOL_CONDENSE_TYPES) { + (void) fprintf(stderr, gettext("missing condense target\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + int error = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, + B_FALSE, condense_cb, &cb); + + return (error); +} + + /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. 
@@ -9767,6 +9882,55 @@ removal_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, } } +static void +condense_status_nvlist(nvlist_t *nvroot, status_cbdata_t *cb, nvlist_t *item) +{ + pool_condense_stat_t *pcnsp = NULL; + uint_t c; + + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CONDENSE_STATS, + (uint64_t **)&pcnsp, &c); + if (pcnsp == NULL || c == 0) + return; + + uint_t n = MIN(POOL_CONDENSE_TYPES, + c / (sizeof (pool_condense_stat_t) / sizeof (uint64_t))); + + nvlist_t *cnv = fnvlist_alloc(); + + for (pool_condense_type_t type = 0; type < n; type++) { + pool_condense_stat_t *pcns = &pcnsp[type]; + if (pcns->pcns_start_time == 0) + continue; + + nvlist_t *nv = fnvlist_alloc(); + + nice_num_str_nvlist(nv, "start_time", + pcns->pcns_start_time, cb->cb_literal, cb->cb_json_as_int, + ZFS_NICE_TIMESTAMP); + if (pcns->pcns_end_time > 0) + nice_num_str_nvlist(nv, "end_time", + pcns->pcns_end_time, cb->cb_literal, + cb->cb_json_as_int, ZFS_NICE_TIMESTAMP); + nice_num_str_nvlist(nv, "processed", + pcns->pcns_processed, cb->cb_literal, cb->cb_json_as_int, + ZFS_NICENUM_1024); + nice_num_str_nvlist(nv, "total", + pcns->pcns_total, cb->cb_literal, cb->cb_json_as_int, + ZFS_NICENUM_1024); + fnvlist_add_string(nv, "unit", condense_type_unit_str[type]); + + fnvlist_add_nvlist(cnv, condense_type_nv_str[type], nv); + fnvlist_free(nv); + } + + if (fnvlist_num_pairs(cnv)) + fnvlist_add_nvlist(item, "condense", cnv); + + fnvlist_free(cnv); +} + + static void scan_status_nvlist(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nvroot, nvlist_t *item) @@ -10213,6 +10377,50 @@ print_checkpoint_status(pool_checkpoint_stat_t *pcs) space_buf); } +static void +print_condense_status(pool_condense_stat_t *pcnsp, uint_t n) +{ + if (pcnsp == NULL || n == 0) + return; + + for (pool_condense_type_t type = 0; type < n; type++) { + pool_condense_stat_t *pcns = &pcnsp[type]; + if (pcns->pcns_start_time == 0) + continue; + + const char *t = (type < POOL_CONDENSE_TYPES) ? 
+ condense_type_str[type] : "[unknown type]"; + const char *u = (type < POOL_CONDENSE_TYPES) ? + condense_type_unit_str[type] : "items"; + + char cur[32], tot[32], elapsed[32]; + zfs_nicenum(pcns->pcns_processed, cur, sizeof (cur)); + zfs_nicenum(pcns->pcns_total, tot, sizeof (tot)); + + if (pcns->pcns_end_time == 0) { + secs_to_dhms(time(NULL) - pcns->pcns_start_time, + elapsed); + (void) printf(gettext( + "condense: %s: condensing, %s/%s %s done in %s\n"), + t, cur, tot, u, elapsed); + } else if (pcns->pcns_processed < pcns->pcns_total) { + secs_to_dhms( + pcns->pcns_end_time - pcns->pcns_start_time, + elapsed); + (void) printf(gettext( + "condense: %s: cancelled, %s/%s %s done in %s\n"), + t, cur, tot, u, elapsed); + } else { + secs_to_dhms( + pcns->pcns_end_time - pcns->pcns_start_time, + elapsed); + (void) printf(gettext( + "condense: %s: done, %s %s done in %s\n"), + t, cur, u, elapsed); + } + } +} + static void print_error_log(zpool_handle_t *zhp) { @@ -10742,6 +10950,7 @@ status_callback_json(zpool_handle_t *zhp, void *data) scan_status_nvlist(zhp, cbp, nvroot, item); removal_status_nvlist(zhp, cbp, nvroot, item); checkpoint_status_nvlist(nvroot, cbp, item); + condense_status_nvlist(nvroot, cbp, item); raidz_expand_status_nvlist(zhp, cbp, nvroot, item); vdev_stats_nvlist(zhp, cbp, nvroot, 0, B_FALSE, NULL, vds); if (cbp->cb_flat_vdevs) { @@ -10889,6 +11098,12 @@ status_callback(zpool_handle_t *zhp, void *data) ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); print_raidz_expand_status(zhp, pres); + pool_condense_stat_t *pcnsp = NULL; + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CONDENSE_STATS, (uint64_t **)&pcnsp, &c); + print_condense_status(pcnsp, + c / (sizeof (pool_condense_stat_t) / sizeof (uint64_t))); + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) diff --git a/include/libzfs.h b/include/libzfs.h index 01d51999f4eb..be6427338dd8 100644 --- a/include/libzfs.h 
+++ b/include/libzfs.h @@ -29,6 +29,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2024, Klara, Inc. */ #ifndef _LIBZFS_H @@ -297,6 +298,8 @@ _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, trimflags_t *); +_LIBZFS_H int zpool_condense(zpool_handle_t *, pool_condense_func_t, + pool_condense_type_t); _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); diff --git a/include/libzfs_core.h b/include/libzfs_core.h index b1d74fbbc8f5..3cbee598dfd5 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -24,6 +24,7 @@ * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2024, Klara, Inc. */ #ifndef _LIBZFS_CORE_H @@ -142,6 +143,9 @@ _LIBZFS_CORE_H int lzc_channel_program_nosync(const char *, const char *, _LIBZFS_CORE_H int lzc_sync(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_reopen(const char *, boolean_t); +_LIBZFS_CORE_H int lzc_condense(const char *, + pool_condense_func_t, pool_condense_type_t); + _LIBZFS_CORE_H int lzc_pool_checkpoint(const char *); _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 1676020d04d3..69b1668ecc89 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -745,6 +745,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_CONDENSE_STATS "com.klarasystems:condense_stats" #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* 
container nvlist of extended stats */ @@ -1213,6 +1214,20 @@ typedef struct vdev_rebuild_stat { uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; +/* + * "Condense" is a general concept for fully writing down an intermediate log, + * journal or cache to its final resting place. The stats for condense are + * counts of how many things need to be written down and how many have been + * done so far, so that 'zpool status' can show progress. How it shows that + * depends on what the thing is. + */ +typedef struct pool_condense_stat { + uint64_t pcns_start_time; /* time_t */ + uint64_t pcns_end_time; /* time_t */ + uint64_t pcns_processed; /* items processed */ + uint64_t pcns_total; /* total items to process */ +} pool_condense_stat_t; + /* * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. * The ordering of this enum must be maintained to ensure the errata identifiers @@ -1355,6 +1370,20 @@ typedef enum pool_trim_func { POOL_TRIM_FUNCS } pool_trim_func_t; +/* + * Condense functions. + */ +typedef enum pool_condense_func { + POOL_CONDENSE_START, + POOL_CONDENSE_CANCEL, + POOL_CONDENSE_FUNCS +} pool_condense_func_t; + +typedef enum pool_condense_type { + POOL_CONDENSE_LOG_SPACEMAP, + POOL_CONDENSE_TYPES, +} pool_condense_type_t; + /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. @@ -1534,6 +1563,7 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ + ZFS_IOC_POOL_CONDENSE, /* 0x5a5a */ /* * Per-platform (Optional) - 8/128 numbers reserved. @@ -1745,6 +1775,12 @@ typedef enum { #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" +/* + * The following are names used when invoking ZFS_IOC_POOL_CONDENSE. 
+ */ +#define ZPOOL_CONDENSE_COMMAND "condense_command" +#define ZPOOL_CONDENSE_TYPE "condense_type" + /* * The following are names used when invoking ZFS_IOC_POOL_WAIT. */ diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index efe18d5410ff..d5bb653a5505 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -359,6 +359,7 @@ struct spa { avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; + spa_log_flushall_mode_t spa_log_flushall_mode; uint64_t spa_log_flushall_txg; @@ -473,6 +474,9 @@ struct spa { uint64_t spa_dedup_dsize; /* cached on-disk size of DDT */ uint64_t spa_dedup_class_full_txg; /* txg dedup class was full */ + /* stats for user-initiated condense operations */ + pool_condense_stat_t spa_condense_stats[POOL_CONDENSE_TYPES]; + /* * spa_refcount & spa_config_lock must be the last elements * because zfs_refcount_t changes size based on compilation options. diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index ac9ae233c72d..f0f53ab5a564 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -478,6 +478,7 @@ + @@ -6013,6 +6014,19 @@ + + + + + + + + + + + + + @@ -6107,6 +6121,7 @@ + @@ -6271,6 +6286,12 @@ + + + + + + @@ -6852,6 +6873,12 @@ + + + + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index f256535e8ea0..63cd41bc4484 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -29,7 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, 2023, Klara Inc. + * Copyright (c) 2021, 2023, 2024, Klara, Inc. 
*/ #include @@ -4404,6 +4404,23 @@ zpool_sync_one(zpool_handle_t *zhp, void *data) return (0); } +int +zpool_condense(zpool_handle_t *zhp, + pool_condense_func_t func, pool_condense_type_t type) +{ + int ret; + + libzfs_handle_t *hdl = zpool_get_handle(zhp); + const char *pool_name = zpool_get_name(zhp); + + if ((ret = lzc_condense(pool_name, func, type)) != 0) { + return (zpool_standard_error_fmt(hdl, ret, + dgettext(TEXT_DOMAIN, "condense '%s' failed"), pool_name)); + } + + return (0); +} + #define PATH_BUF_LEN 64 /* diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 6a9c20a2bb88..3ba1b67dc430 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -162,6 +162,7 @@ + @@ -1524,6 +1525,19 @@ + + + + + + + + + + + + + @@ -1618,6 +1632,7 @@ + @@ -2882,6 +2897,12 @@ + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index d07fca6cebad..84099530305c 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -26,6 +26,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2024, Klara, Inc. */ /* @@ -493,6 +494,23 @@ lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL)); } +int +lzc_condense(const char *pool_name, + pool_condense_func_t func, pool_condense_type_t type) +{ + int error; + + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_CONDENSE_COMMAND, (uint64_t)func); + fnvlist_add_uint64(args, ZPOOL_CONDENSE_TYPE, (uint64_t)type); + + error = lzc_ioctl(ZFS_IOC_POOL_CONDENSE, pool_name, args, NULL); + + fnvlist_free(args); + + return (error); +} + /* * Create "user holds" on snapshots. If there is a hold on a snapshot, * the snapshot can not be destroyed. 
(However, it can be marked for deletion diff --git a/man/Makefile.am b/man/Makefile.am index fde704933764..3ccbc22bfeec 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -69,6 +69,7 @@ dist_man_MANS = \ %D%/man8/zpool-attach.8 \ %D%/man8/zpool-checkpoint.8 \ %D%/man8/zpool-clear.8 \ + %D%/man8/zpool-condense.8 \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ diff --git a/man/man8/zpool-condense.8 b/man/man8/zpool-condense.8 new file mode 100644 index 000000000000..1c4a38de14cd --- /dev/null +++ b/man/man8/zpool-condense.8 @@ -0,0 +1,62 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2024, Klara, Inc. +.\" +.Dd November 11, 2024 +.Dt ZPOOL-CONDENSE 8 +.Os +. +.Sh NAME +.Nm zpool-condense +.Nd Condense, flush, garbage collect or otherwise clean up pool metadata +.Sh SYNOPSIS +.Nm zpool +.Cm condense +.Fl t Ar target +.Op Fl c | w +.Ar pool +.Sh DESCRIPTION +Many internal pool metadata updates are performed in the background at a rate +chosen to limit the performance impact to normal use of the pool. +Sometimes it is desirable to accelerate these operations, +even if it affects overall performance. 
+.Sy condense +allows an operator to request that a specific background operation be +prioritised to complete as soon as possible. +.Pp +These are the possible values for +.Ar target : +.Bl -tag -compact -offset Ds -width "log-spacemap" +.It Sy log-spacemap +flushing log spacemap entries to their underlying metaslabs +.El +.Bl -tag -width Ds +.It Fl c +Cancel a previous +.Sy condense . +This will return background updates to their normal rate. +.It Fl w +Wait until the condense has completed before returning. +.El +.Sh SEE ALSO +.Xr zpool-status 8 , +.Xr zpool-wait 8 diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index b9b54185d050..11441d0f3c74 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -25,8 +25,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd February 14, 2024 +.Dd November 11, 2024 .Dt ZPOOL-STATUS 8 .Os . @@ -356,6 +357,7 @@ can be used to run a script on each VDEV. .Ed . .Sh SEE ALSO +.Xr zpool-condense 8 , .Xr zpool-events 8 , .Xr zpool-history 8 , .Xr zpool-iostat 8 , diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 50f947bab603..7c1a93757ff3 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -26,8 +26,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd May 27, 2021 +.Dd November 11, 2024 .Dt ZPOOL-WAIT 8 .Os . @@ -78,6 +79,8 @@ Scrub to cease Manual trim to cease .It Sy raidz_expand Attaching to a RAID-Z vdev to complete +.It Sy condense +Metadata condense operations to complete .El .Pp If an @@ -109,6 +112,7 @@ See . 
.Sh SEE ALSO .Xr zpool-checkpoint 8 , +.Xr zpool-condense 8 , .Xr zpool-initialize 8 , .Xr zpool-remove 8 , .Xr zpool-replace 8 , diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 02a258f66708..a45b0ccd8e82 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -25,8 +25,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara, Inc. .\" -.Dd February 14, 2024 +.Dd November 11, 2024 .Dt ZPOOL 8 .Os . @@ -177,6 +178,8 @@ specified. Prefetches specific types of pool data. .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. +.It Xr zpool-condense 8 +Condense, flush, garbage collect or otherwise clean up pool metadata. .It Xr zpool-checkpoint 8 Checkpoints the current state of .Ar pool , diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 9d12bc2eb0a2..b65fc7236a57 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara, Inc. */ /* @@ -435,6 +436,14 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, sizeof (pres) / sizeof (uint64_t)); } + + pool_condense_stat_t pcns[POOL_CONDENSE_TYPES]; + memcpy(pcns, spa->spa_condense_stats, + sizeof (pool_condense_stat_t) * POOL_CONDENSE_TYPES); + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CONDENSE_STATS, (uint64_t *)pcns, + (sizeof (pool_condense_stat_t) / sizeof (uint64_t)) * + POOL_CONDENSE_TYPES); } static void diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 8188a9e46865..21973b664b3a 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,7 +38,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
* Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, 2021, 2023, 2024, Klara Inc. + * Copyright (c) 2019, 2021, 2023, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ @@ -7079,6 +7079,70 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) return (0); } +static const zfs_ioc_key_t zfs_keys_pool_condense[] = { + {ZPOOL_CONDENSE_COMMAND, DATA_TYPE_UINT64, 0}, + {ZPOOL_CONDENSE_TYPE, DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_condense(const char *pool, nvlist_t *innvl, nvlist_t *onvl) +{ + spa_t *spa; + int err; + + uint64_t cmd; + if (nvlist_lookup_uint64(innvl, ZPOOL_CONDENSE_COMMAND, &cmd) != 0) + return (SET_ERROR(EINVAL)); + + if (cmd >= POOL_CONDENSE_FUNCS) + return (SET_ERROR(EINVAL)); + + uint64_t type; + if (nvlist_lookup_uint64(innvl, ZPOOL_CONDENSE_TYPE, &type) != 0) + return (SET_ERROR(EINVAL)); + + if (type >= POOL_CONDENSE_TYPES) + return (SET_ERROR(EINVAL)); + + if ((err = spa_open(pool, &spa, FTAG)) != 0) + return (err); + + if (spa_suspended(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EAGAIN)); + } + + if (!spa_writeable(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EROFS)); + } + + switch (type) { + case POOL_CONDENSE_LOG_SPACEMAP: + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + if (cmd == POOL_CONDENSE_START) + spa_log_flushall_start(spa, + SPA_LOG_FLUSHALL_REQUEST, 0); + else + spa_log_flushall_cancel(spa); + + break; + + default: + /* unreachable */ + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + spa_close(spa, FTAG); + + return (0); +} + /* * Load a user's wrapping key into the kernel. 
* innvl: { @@ -7426,6 +7490,10 @@ zfs_ioctl_init(void) zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); + zfs_ioctl_register("condense", ZFS_IOC_POOL_CONDENSE, + zfs_ioc_pool_condense, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_pool_condense, ARRAY_SIZE(zfs_keys_pool_condense)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); From 963af683a3f5719bcfb91c6c801fcd5e1b8341af Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 4 Oct 2023 07:31:19 +1100 Subject: [PATCH 4/6] condense: add support for waiting for condense to complete Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- cmd/zpool/zpool_main.c | 42 +++++++++++++++++++++++++++++---- include/sys/fs/zfs.h | 1 + lib/libzfs/libzfs.abi | 3 ++- lib/libzfs_core/libzfs_core.abi | 3 ++- module/zfs/spa.c | 14 +++++++++++ module/zfs/spa_log_spacemap.c | 4 ++++ 6 files changed, 61 insertions(+), 6 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 35cc24f1ee45..72aa7148d4c2 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -561,7 +561,7 @@ get_usage(zpool_help_t idx) case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_CONDENSE: - return (gettext("\tcondense -t [-c] \n")); + return (gettext("\tcondense -t [-c | -w] \n")); case HELP_VERSION: return (gettext("\tversion [-j]\n")); case HELP_WAIT: @@ -8717,10 +8717,11 @@ condense_cb(zpool_handle_t *zhp, void *data) } /* - * zpool condense -t [-c] + * zpool condense -t [-c | -w] * * -t What to condense. * -c Cancel. Ends any in-progress condense. + * -w Wait. Blocks until condense has completed. 
* * Condense (flush) the log spacemap on the specified pool(s). */ @@ -8730,6 +8731,7 @@ zpool_do_condense(int argc, char **argv) struct option long_options[] = { {"target", required_argument, NULL, 't'}, {"cancel", no_argument, NULL, 'c'}, + {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; @@ -8745,6 +8747,7 @@ zpool_do_condense(int argc, char **argv) .func = POOL_CONDENSE_START, .type = POOL_CONDENSE_TYPES, }; + boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "t:cw", long_options, NULL)) @@ -8769,6 +8772,9 @@ zpool_do_condense(int argc, char **argv) case 'c': cb.func = POOL_CONDENSE_CANCEL; break; + case 'w': + wait = B_TRUE; + break; case '?': if (optopt != 0) { (void) fprintf(stderr, @@ -8796,9 +8802,20 @@ zpool_do_condense(int argc, char **argv) return (-1); } + if (wait && (cb.func != POOL_CONDENSE_START)) { + (void) fprintf(stderr, gettext("-w cannot be used with -c\n")); + usage(B_FALSE); + } + int error = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, condense_cb, &cb); + if (wait && !error) { + zpool_wait_activity_t act = ZPOOL_WAIT_CONDENSE; + error = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, + B_FALSE, wait_callback, &act); + } + return (error); } @@ -13314,8 +13331,10 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; pool_raidz_expand_stat_t *pres = NULL; + pool_condense_stat_t *pcns = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", - "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; + "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND", + "CONDENSE"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ @@ -13384,6 +13403,21 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; } + /* + * Count each outstanding condense item as a "byte". 
It's not true, + * but it's a counter, and it'll display nicely. + */ + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CONDENSE_STATS, (uint64_t **)&pcns, &c); + c = c / (sizeof (pool_condense_stat_t) / sizeof (uint64_t)); + if (pcns != NULL && c > 0) { + do { + c--; + bytes_rem[ZPOOL_WAIT_CONDENSE] += + (pcns[c].pcns_total - pcns[c].pcns_processed); + } while (c > 0); + } + bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = @@ -13522,7 +13556,7 @@ zpool_do_wait(int argc, char **argv) static const char *const col_opts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", - "raidz_expand" }; + "raidz_expand", "condense" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 69b1668ecc89..e52617ffaa30 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1688,6 +1688,7 @@ typedef enum { ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_RAIDZ_EXPAND, + ZPOOL_WAIT_CONDENSE, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f0f53ab5a564..ffb9b6eadc6b 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -6147,7 +6147,8 @@ - + + diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 3ba1b67dc430..915b5be392c2 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -1658,7 +1658,8 @@ - + + diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 4ffac89f70a2..e015924334a2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -10803,6 +10803,20 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); break; } + case ZPOOL_WAIT_CONDENSE: { + pool_condense_stat_t *pcns; + *in_progress = B_FALSE; + for (pool_condense_type_t type = 0; + type <
POOL_CONDENSE_TYPES; type++) { + pcns = &spa->spa_condense_stats[type]; + if (pcns->pcns_start_time > 0 && + pcns->pcns_end_time == 0) { + *in_progress = B_TRUE; + break; + } + } + break; + } default: panic("unrecognized value for activity %d", activity); } diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index a1654bec9036..42a31f4f38f0 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -815,6 +815,8 @@ spa_log_flushall_done(spa_t *spa) spa->spa_log_flushall_mode = SPA_LOG_FLUSHALL_NONE; spa->spa_log_flushall_txg = 0; + + spa_notify_waiters(spa); } void @@ -832,6 +834,8 @@ spa_log_flushall_cancel(spa_t *spa) pool_condense_stat_t *pcns = &spa->spa_condense_stats[POOL_CONDENSE_LOG_SPACEMAP]; pcns->pcns_end_time = gethrestime_sec(); + + spa_notify_waiters(spa); } void From 94b13649d352053212a76e295de96d505b87330a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 8 Mar 2024 12:31:29 +1100 Subject: [PATCH 5/6] ztest: periodically start/stop log spacemap flush Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- cmd/ztest.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 4a7959ebfca5..c55aab816750 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -26,7 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2023, Klara, Inc. + * Copyright (c) 2023, 2024, Klara, Inc. 
*/ /* @@ -449,6 +449,8 @@ ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_pool_prefetch_ddt; ztest_func_t ztest_ddt_prune; +ztest_func_t ztest_spa_log_flushall_start; +ztest_func_t ztest_spa_log_flushall_cancel; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -506,6 +508,8 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), + ZTI_INIT(ztest_spa_log_flushall_start, 1, &zopt_rarely), + ZTI_INIT(ztest_spa_log_flushall_cancel, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -6217,6 +6221,20 @@ ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) } } +void +ztest_spa_log_flushall_start(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + spa_log_flushall_start(ztest_spa, SPA_LOG_FLUSHALL_REQUEST, 0); +} + +void +ztest_spa_log_flushall_cancel(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + spa_log_flushall_cancel(ztest_spa); +} + void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { From 0aa7b7f493072a94c1ba9ce21b55669852c5badf Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 12 Nov 2024 14:54:39 +1100 Subject: [PATCH 6/6] zts: add test for log spacemap flushall + zpool condense Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../log_spacemap/log_spacemap_flushall.ksh | 80 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index fc4adc42d00a..f22d6f261c46 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -1054,7 +1054,7 @@ tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] -tests = ['log_spacemap_import_logs'] +tests = ['log_spacemap_import_logs', 'log_spacemap_flushall'] pre = post = tags = ['functional', 'log_spacemap'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7d1551a63f0d..d3b4aa20e94c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1612,6 +1612,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/longname/longname_002_pos.ksh \ functional/longname/longname_003_pos.ksh \ functional/longname/setup.ksh \ + functional/log_spacemap/log_spacemap_flushall.ksh \ functional/log_spacemap/log_spacemap_import_logs.ksh \ functional/migration/cleanup.ksh \ functional/migration/migration_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh b/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh new file mode 100755 index 000000000000..10e4f80c03d3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_flushall.ksh @@ -0,0 +1,80 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: + +# This tests the on-demand "flush all spacemap logs" feature. This is the same +# process that is triggered at pool export, but instead we trigger it ahead of +# time via `zpool condense`. +# +# This test uses the `log_spacemaps` kstat and `zdb -m` to know how much is +# waiting to be flushed. All we're looking for is that the flushall function +# works, not how much it's doing. + +# +# STRATEGY: +# 1. Create pool. +# 2. Write things, which will add to the spacemap logs. +# 3. Save the counters. +# 4. Request the spacemap logs be flushed. +# 5. Compare counters against previous values. +# + +verify_runnable "global" + +function cleanup +{ + if poolexists $LOGSM_POOL; then + log_must zpool destroy -f $LOGSM_POOL + fi +} +log_onexit cleanup + +function get_smp_length { + zdb -m $LOGSM_POOL | grep smp_length | \ + awk '{ sum += $3 } END { print sum }' +} + +LOGSM_POOL="logsm_flushall" +read -r TESTDISK _ <<<"$DISKS" + +log_must zpool create -o cachefile=none -f -O compression=off \ + $LOGSM_POOL $TESTDISK + +log_must file_write -o create -f /$LOGSM_POOL/f1 -b 131072 -c 32 -d R +log_must file_write -o create -f /$LOGSM_POOL/f2 -b 131072 -c 32 -d R +log_must file_write -o create -f /$LOGSM_POOL/f3 -b 131072 -c 32 -d R +log_must file_write -o create -f /$LOGSM_POOL/f4 -b 131072 -c 32 -d R + +sync_all_pools + +typeset length_1=$(get_smp_length) + +log_must zpool condense -t log-spacemap -w $LOGSM_POOL + +typeset length_2=$(get_smp_length) + +log_must test $length_1 -gt $length_2 + +log_pass "Log spacemaps on-demand flushall works"