From c7a8a2e202cc1026a309be31a77bcb2875658bee Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 14 Jan 2025 16:38:37 -0800 Subject: [PATCH] Expand fragmentation table to reflect larger possible allocation sizes When you are using large recordsizes in conjunction with raidz, with incompressible data, you can pretty reliably be making 21 MB allocations. Unfortunately, the fragmentation metric in ZFS considers any metaslabs with 16 MB free chunks completely unfragmented, so you can have a metaslab report 0% fragmented and be unable to satisfy an allocation. When using the segment-based metaslab weight, this is inconvenient; when using the space-based one, it can seriously degrade performance. We expand the fragmentation table to extend up to 512MB, and redefine the table size based on the actual table, rather than having a static define. We also tweak the one variable that depends on fragmentation directly. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Paul Dagnelie --- man/man4/zfs.4 | 2 +- module/zfs/metaslab.c | 57 +++++++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index dd0b3d848fe9..9d83357fcc6d 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1778,7 +1778,7 @@ Normally disabled because these datasets may be missing key data. .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . -.It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint +.It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 77 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. 
An active metaslab that exceeds this threshold diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 7affbfac9dc7..353a99605913 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -146,7 +146,7 @@ static uint_t zfs_mg_fragmentation_threshold = 95; * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ -static uint_t zfs_metaslab_fragmentation_threshold = 70; +static uint_t zfs_metaslab_fragmentation_threshold = 77; /* * When set will load all metaslabs when pool is first opened. @@ -2889,8 +2889,6 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } -#define FRAGMENTATION_TABLE_SIZE 17 - /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done @@ -2901,33 +2899,40 @@ metaslab_fini(metaslab_t *msp) * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in - * large segments. A 10% change in fragmentation equates to approximately - * double the number of segments. + * large segments. * - * This table defines 0% fragmented space using 16MB segments. Testing has - * shown that segments that are greater than or equal to 16MB do not suffer - * from drastic performance problems. Using this value, we derive the rest - * of the table. Since the fragmentation value is never stored on disk, it - * is possible to change these calculations in the future. + * This table defines 0% fragmented space using 512M segments. Using this value, + * we derive the rest of the table. This table originally went up to 16MB, but + * with larger recordsizes, larger ashifts, and use of raidz3, it is possible + * to have significantly larger allocations than were previously possible. 
+ * Since the fragmentation value is never stored on disk, it is possible to + * change these calculations in the future. */ -static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { +static const int zfs_frag_table[] = { 100, /* 512B */ - 100, /* 1K */ - 98, /* 2K */ - 95, /* 4K */ - 90, /* 8K */ - 80, /* 16K */ - 70, /* 32K */ - 60, /* 64K */ - 50, /* 128K */ - 40, /* 256K */ - 30, /* 512K */ - 20, /* 1M */ - 15, /* 2M */ - 10, /* 4M */ - 5, /* 8M */ - 0 /* 16M */ + 99, /* 1K */ + 97, /* 2K */ + 93, /* 4K */ + 88, /* 8K */ + 83, /* 16K */ + 77, /* 32K */ + 71, /* 64K */ + 64, /* 128K */ + 57, /* 256K */ + 50, /* 512K */ + 43, /* 1M */ + 36, /* 2M */ + 29, /* 4M */ + 23, /* 8M */ + 17, /* 16M */ + 12, /* 32M */ + 7, /* 64M */ + 3, /* 128M */ + 1, /* 256M */ + 0, /* 512M */ }; +#define FRAGMENTATION_TABLE_SIZE \ + (sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0]))) /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation.