Skip to content

Commit

Permalink
Implement parallel dbuf eviction
Browse files Browse the repository at this point in the history
In the previous code, dbuf_evict_thread() would call dbuf_evict_one()
in a loop while dbuf_cache_above_lowater().

dbuf_evict_one() would select a random sublist from the dbuf cache,
then walk it from the tail forward, attempting to acquire the lock on
each object until it succeeded, then evict that object and return.

As the name suggests, it would evict only a single object from the
cache. However, evicting one object is not likely to bring us below the
desired low water mark, so dbuf_evict_one() will be called again, where
it will loop over all of the same busy objects again, until it finds
one it can evict.

This has been replaced with dbuf_evict_many() which takes a specific
sublist as a parameter, as well as a desired amount of data to evict.
It then walks the sublist from the tail forward, evicting what it can
until the number of bytes evicted satisfies the input parameter or
the head of the sublist is reached.

The dbuf_evict_thread now runs in parallel as well, allowing it to
keep up with demand more easily. For the dbuf cache, if the single
thread was not able to keep up, ZFS would shift the work of evicting
some items to each incoming I/O thread. While that is still the case
it should be seen much less often now that dbuf_evict is more efficient
and no longer bottlenecked to a single thread.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <[email protected]>
Co-authored-by: Mateusz Piotrowski <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
Signed-off-by: Mateusz Piotrowski <[email protected]>
  • Loading branch information
3 people authored and alex-stetsenko committed Jan 20, 2025
1 parent 788e69c commit 5615676
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 15 deletions.
23 changes: 21 additions & 2 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd November 1, 2024
.Dt ZFS 4
.Os
Expand Down Expand Up @@ -75,6 +73,27 @@ When set to
.Sy 0
the array is dynamically sized based on total system memory.
.
.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq int
Controls the number of dbuf eviction threads to be used.
.Pp
When set to 0, ZFS will compute the number of required eviction threads
depending on the number of CPU cores (max_ncpus).
The minimum number of threads is 1 and applies to systems with 1 to 3 CPU cores.
Systems with 4 to 7 CPU cores get 2 eviction threads.
ZFS on systems larger than that uses log2 of the CPU count
plus one for each 32 CPUs.
This way the number of eviction threads scales up more on high CPU counts.
Currently, ZFS will not scale automatically beyond 16 threads.
.Pp
When set to 1, the parallel dbuf eviction is disabled.
Only one thread will be used to evict dbufs.
.Pp
When set to a value greater than 1, the value will be used as an exact number
of eviction threads.
If changed live, it will be limited by the number of threads allocated on
module load.
.
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
Expand Down
185 changes: 172 additions & 13 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
static kmem_cache_t *dbuf_kmem_cache;
kmem_cache_t *dbuf_dirty_kmem_cache;
static taskq_t *dbu_evict_taskq;
static taskq_t *dbuf_evict_taskq;

static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
Expand Down Expand Up @@ -237,6 +238,24 @@ static uint_t dbuf_metadata_cache_shift = 6;
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
static uint_t dbuf_mutex_cache_shift = 0;

/*
 * Controls the number of dbuf eviction threads.
 * The value is re-read on every eviction pass, so it may be changed live.
 * Possible values:
 * 0 (auto) use the thread count computed at module load time
 *   (a logarithmic function of the CPU count).
 * 1 (disabled) one thread - parallel eviction is disabled.
 * 2+ (manual) set the number manually, limited by dbuf_evict_threads_max.
 */
static uint_t dbuf_evict_threads = 0;

/*
 * The number of allocated dbuf eviction threads. This limits the maximum value
 * of dbuf_evict_threads.
 * The number is set up at module load time and depends on the initial value of
 * dbuf_evict_threads. If dbuf_evict_threads is set to auto, a logarithmic
 * function is used to compute this value. Otherwise, it is set to half of
 * max_ncpus. On a single-CPU system it is left at zero and eviction stays
 * single-threaded.
 */
static uint_t dbuf_evict_threads_max;

static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);

Expand Down Expand Up @@ -768,26 +787,47 @@ dbuf_cache_above_lowater(void)
}

/*
* Evict the oldest eligible dbuf from the dbuf cache.
* Evict the oldest eligible dbufs from the dbuf cache.
* Use the multilist sublist (mls) with the provided index #idx.
*/
static void
dbuf_evict_one(void)
dbuf_evict_many(uint64_t bytes, unsigned int idx)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
int64_t evicted = 0;
dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
marker->db_objset = NULL;

ASSERT3U(idx, <, multilist_get_num_sublists(
&dbuf_caches[DB_DBUF_CACHE].cache));

multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);

ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
}
multilist_sublist_insert_after(mls, db, marker);

while (db != NULL && evicted < bytes) {
int skip = 0;
while (db != NULL && (db->db_objset == NULL ||
mutex_tryenter(&db->db_mtx) == 0)) {
db = multilist_sublist_prev(mls, db);
if (skip == 0)
skip = 1;
}

DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);
if (db == NULL)
break;

if (skip) {
multilist_sublist_remove(mls, marker);
multilist_sublist_insert_before(mls, db, marker);
}

DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);

if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
uint64_t size = db->db.db_size;
Expand All @@ -803,9 +843,97 @@ dbuf_evict_one(void)
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
DBUF_STAT_BUMP(cache_total_evicts);
evicted += size + usize;

mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
db = multilist_sublist_prev(mls, marker);
}

multilist_sublist_remove(mls, marker);
multilist_sublist_unlock(mls);
kmem_cache_free(dbuf_kmem_cache, marker);
}

/*
 * Argument block for one parallel eviction task. dbuf_evict() fills in one
 * of these per dispatched task and dbuf_evict_task() consumes it.
 */
typedef struct evict_arg {
	taskq_ent_t	tqe;	/* entry handed to taskq_dispatch_ent() */
	unsigned int	idx;	/* sublist index given to dbuf_evict_many() */
	uint64_t	bytes;	/* byte count given to dbuf_evict_many() */
} evict_arg_t;

/*
 * Taskq callback for parallel eviction: unpack the evict_arg_t passed as
 * arg and evict the requested number of bytes from the requested sublist.
 */
static void
dbuf_evict_task(void *arg)
{
	const evict_arg_t *task = arg;

	dbuf_evict_many(task->bytes, task->idx);
}

/*
 * Lower bound on the number of bytes assigned to a single eviction task.
 * The smallest amount that can be evicted at once is one block, so
 * SPA_MAXBLOCKSIZE is a reasonable minimum per task. dbuf_evict() uses it
 * to dispatch fewer tasks when the amount to evict is small.
 */
#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)

/*
 * Evict enough data to bring the dbuf cache back down to its low water mark.
 *
 * The excess (current cache size minus dbuf_cache_lowater_bytes()) is either
 * evicted by the calling thread from one randomly chosen sublist, or split
 * into tasks that are dispatched to dbuf_evict_taskq, one per consecutive
 * sublist starting at a random index and wrapping around. In the parallel
 * case this function waits for the taskq to drain before returning.
 *
 * The single-threaded path is taken when the effective thread count is at
 * most one, or when the task argument array cannot be allocated without
 * sleeping.
 */
static void
dbuf_evict(void)
{
	int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
	    dbuf_cache_lowater_bytes());

	if (bytes <= 0)
		return;

	unsigned int num_sublists = multilist_get_num_sublists(
	    &dbuf_caches[DB_DBUF_CACHE].cache);

	/*
	 * dbuf_evict_threads is a live tunable; read it once so that the
	 * zero check and the clamp below operate on the same value.
	 */
	uint_t tunable = dbuf_evict_threads;
	uint_t nthreads = MIN(num_sublists, (tunable == 0 ?
	    dbuf_evict_threads_max :
	    MIN(tunable, dbuf_evict_threads_max)));
	boolean_t use_evcttq = nthreads > 1;
	evict_arg_t *evarg = NULL;

	if (use_evcttq) {
		evarg = kmem_zalloc(sizeof (*evarg) * nthreads, KM_NOSLEEP);
		/*
		 * Fall back to a regular single-threaded eviction.
		 */
		if (evarg == NULL)
			use_evcttq = B_FALSE;
	}

	unsigned idx = multilist_get_random_index(
	    &dbuf_caches[DB_DBUF_CACHE].cache);

	if (!use_evcttq) {
		dbuf_evict_many(bytes, idx);
		return;
	}

	/*
	 * Go to the parallel eviction.
	 */
	uint64_t evict;
	uint_t ntasks;

	if (bytes > nthreads * MIN_EVICT_SIZE) {
		/* Plenty to evict: give every thread an equal share. */
		evict = DIV_ROUND_UP(bytes, nthreads);
		ntasks = nthreads;
	} else {
		/*
		 * Not enough for every thread to get MIN_EVICT_SIZE: use
		 * fewer tasks of MIN_EVICT_SIZE each. ntasks cannot exceed
		 * nthreads here, so evarg[] is large enough.
		 */
		evict = MIN_EVICT_SIZE;
		ntasks = DIV_ROUND_UP(bytes, MIN_EVICT_SIZE);
	}

	for (unsigned i = 0; i < ntasks; i++) {
		evarg[i].idx = idx;
		evarg[i].bytes = evict;

		taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
		    &evarg[i], 0, &evarg[i].tqe);

		/* wrap idx */
		if (++idx >= num_sublists)
			idx = 0;
	}

	/* evarg[] must stay valid until every dispatched task has run. */
	taskq_wait(dbuf_evict_taskq);
	kmem_free(evarg, sizeof (*evarg) * nthreads);
}

/*
Expand Down Expand Up @@ -839,7 +967,7 @@ dbuf_evict_thread(void *unused)
* minimize lock contention.
*/
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
dbuf_evict_one();
dbuf_evict();
}

mutex_enter(&dbuf_evict_lock);
Expand All @@ -866,7 +994,7 @@ dbuf_evict_notify(uint64_t size)
*/
if (size > dbuf_cache_target_bytes()) {
if (size > dbuf_cache_hiwater_bytes())
dbuf_evict_one();
dbuf_evict();
cv_signal(&dbuf_evict_cv);
}
}
Expand Down Expand Up @@ -980,6 +1108,27 @@ dbuf_init(void)
* configuration is not required.
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
if (max_ncpus > 1) {
if (dbuf_evict_threads == 0) {
/*
* Limit the maximum number of threads to 16.
* We reach the limit when max_ncpus == 256.
*/
uint_t nthreads = MIN((highbit64(max_ncpus) - 1) +
max_ncpus / 32, 16);
dbuf_evict_threads_max = max_ncpus < 4 ? 1 :
nthreads;
} else {
dbuf_evict_threads_max = max_ncpus / 2;
}

if (dbuf_evict_threads_max > 1) {
dbuf_evict_taskq = taskq_create("dbuf_evict",
dbuf_evict_threads_max,
defclsyspri, 0, INT_MAX, TASKQ_PREPOPULATE);
}
}


for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
multilist_create(&dbuf_caches[dcs].cache,
Expand Down Expand Up @@ -1047,6 +1196,10 @@ dbuf_fini(void)
kmem_cache_destroy(dbuf_kmem_cache);
kmem_cache_destroy(dbuf_dirty_kmem_cache);
taskq_destroy(dbu_evict_taskq);
if (dbuf_evict_taskq != NULL) {
taskq_wait(dbuf_evict_taskq);
taskq_destroy(dbuf_evict_taskq);
}

mutex_enter(&dbuf_evict_lock);
dbuf_evict_thread_exit = B_TRUE;
Expand Down Expand Up @@ -4106,7 +4259,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
* +-----dbuf_destroy()<--dbuf_evict()<------------+
*
*/
void
Expand Down Expand Up @@ -5440,3 +5593,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
"Set size of dbuf cache mutex array as log2 shift.");

/*
 * Eviction-thread tunables. They are registered under the zfs_dbuf scope,
 * like the other dbuf_ parameters in this file.
 */
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW,
	"Controls the number of dbuf eviction threads");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads_max, UINT, ZMOD_RD,
	"The number of allocated dbuf eviction threads");

0 comments on commit 5615676

Please sign in to comment.