From c15ee614a1a582d7a8bd290000e6ae01c84268d8 Mon Sep 17 00:00:00 2001
From: Alexander Stetsenko <alex.stetsenko@gmail.com>
Date: Thu, 22 Aug 2024 15:35:20 -0400
Subject: [PATCH] Implement parallel ARC eviction

Read and write performance can become limited by the arc_evict
process being single threaded. Additional data cannot be added
to the ARC until sufficient existing data is evicted.

On many-core systems with TBs of RAM, a single thread becomes
a significant bottleneck.

With the change we see a 25% increase in read and write throughput

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <allan@klarasystems.com>
Co-authored-by: Mateusz Piotrowski <mateusz.piotrowski@klarasystems.com>
Signed-off-by: Alexander Stetsenko <alex.stetsenko@klarasystems.com>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Mateusz Piotrowski <mateusz.piotrowski@klarasystems.com>
---
 man/man4/zfs.4   |  26 +++++++-
 module/zfs/arc.c | 160 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 176 insertions(+), 10 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..2743b9bd705b 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -16,8 +16,6 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.\" Copyright (c) 2024, Klara, Inc.
-.\"
 .Dd November 1, 2024
 .Dt ZFS 4
 .Os
@@ -724,6 +722,30 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
 This batch-style operation prevents entire sub-lists from being evicted at once
 but comes at a cost of additional unlocking and locking.
 .
+.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
+Controls the number of ARC eviction threads to be used.
+.Pp
+When set to 0, ZFS will compute the number of required eviction threads
+depending on the number of CPU cores (ncpu_max).
+The minimum number of threads is 1 and applies to systems from 1 to 5 CPU cores.
+Systems with 6 CPU cores get 2 eviction threads.
+ZFS on systems larger than that uses log2 of the CPU count
+plus one for each 64 CPUs.
+This way the number of eviction threads scales up more on high CPU counts.
+Currently, ZFS will not scale automatically beyond 16 threads.
+.Pp
+When set to 1, the parallel arc eviction is disabled.
+Only one thread will be used to evict from ARC.
+.Pp
+When set to a value greater than 1, the value will be used as an exact number
+of eviction threads.
+If changed live, it will be limited by number of threads allocated on module
+load.
+.Pp
+More threads may improve the responsiveness of ZFS to memory pressure.
+This can be important for performance when eviction from the ARC becomes
+a bottleneck for reads and writes.
+.
 .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
 If set to a non zero value, it will replace the
 .Sy arc_grow_retry
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index bd6f076dfbbd..735ce2324375 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -468,6 +468,24 @@ static int zfs_arc_prune_task_threads = 1;
 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
 static taskq_t *arc_flush_taskq;
 
+/*
+ * Controls the number of ARC eviction threads.
+ * Possible values:
+ * 0  (auto) compute the number of threads using a logarithmic formula.
+ * 1  (disabled) one thread - parallel eviction is disabled.
+ * 2+ (manual) set the number manually, limited by zfs_arc_evict_threads_max.
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
+/*
+ * The number of allocated ARC eviction threads. This limits the maximum value
+ * of zfs_arc_evict_threads.
+ * The number is set up at module load time and depends on the initial value of
+ * zfs_arc_evict_threads. If zfs_arc_evict_threads is set to auto, a logarithmic
+ * function is used to compute this value. Otherwise, it is set to max_ncpus.
+ */
+static uint_t zfs_arc_evict_threads_max;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -3911,7 +3929,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 		 * specifically implemented to ensure this is the case
 		 * (only 'marker' will be removed and re-inserted).
 		 */
-		multilist_sublist_move_forward(mls, marker);
 
 		/*
 		 * The only case where the b_spa field should ever be
@@ -3921,11 +3938,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 		 * dsl_pool_close() and zio_inject_fault()), so we must
 		 * skip any markers we see from these other threads.
 		 */
-		if (hdr->b_spa == 0)
+		if (hdr->b_spa == 0) {
+			multilist_sublist_move_forward(mls, marker);
 			continue;
+		}
 
 		/* we're only interested in evicting buffers of a certain spa */
 		if (spa != 0 && hdr->b_spa != spa) {
+			multilist_sublist_move_forward(mls, marker);
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
@@ -3960,6 +3980,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 				evict_count--;
 
 		} else {
+			multilist_sublist_move_forward(mls, marker);
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
 	}
@@ -4047,6 +4068,40 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
+static taskq_t *arc_evict_taskq;
+
+typedef struct evict_arg {
+	taskq_ent_t	tqe;
+	multilist_t	*ml;
+	int		idx;
+	arc_buf_hdr_t	*marker;
+	uint64_t	spa;
+	uint64_t	bytes;
+	uint64_t	*evicted_ptr;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	uint64_t *evictedp = eva->evicted_ptr;
+	multilist_t *ml = eva->ml;
+	arc_buf_hdr_t *marker = eva->marker;
+	int idx = eva->idx;
+	uint64_t spa = eva->spa;
+	uint64_t evict = eva->bytes;
+	uint64_t bytes_evicted = arc_evict_state_impl(ml, idx, marker, spa,
+	    evict);
+	atomic_add_64(evictedp, bytes_evicted);
+}
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	MIN_EVICT_SIZE	(SPA_MAXBLOCKSIZE)
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
@@ -4066,10 +4121,14 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 {
 	uint64_t total_evicted = 0;
 	multilist_t *ml = &state->arcs_list[type];
-	int num_sublists;
 	arc_buf_hdr_t **markers;
+	evict_arg_t *evarg = NULL;
+	unsigned num_sublists = multilist_get_num_sublists(ml);
+	uint_t nthreads = MIN(num_sublists, (zfs_arc_evict_threads == 0 ?
+	    zfs_arc_evict_threads_max :
+	    MIN(zfs_arc_evict_threads, zfs_arc_evict_threads_max)));
+	boolean_t use_evcttq = nthreads > 1;
 
-	num_sublists = multilist_get_num_sublists(ml);
 
 	/*
 	 * If we've tried to evict from each sublist, made some
@@ -4092,6 +4151,16 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		multilist_sublist_unlock(mls);
 	}
 
+	if (use_evcttq) {
+		evarg = kmem_alloc(sizeof (*evarg) * nthreads, KM_NOSLEEP);
+		/*
+		 * Fall back to the regular single evict if it is not possible
+		 * to allocate memory for the task queue entries.
+		 */
+		if (evarg == NULL)
+			use_evcttq = B_FALSE;
+	}
+
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
@@ -4099,6 +4168,24 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
 		uint64_t scan_evicted = 0;
+		uint64_t left = (bytes == ARC_EVICT_ALL ? bytes :
+		    bytes - total_evicted);
+		uint64_t evict;
+		uint_t ntasks;
+
+		if (use_evcttq) {
+			if (left > nthreads * MIN_EVICT_SIZE) {
+				evict = DIV_ROUND_UP(left, nthreads);
+				ntasks = nthreads;
+			} else {
+				evict = MIN_EVICT_SIZE;
+				ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
+			}
+		} else {
+			ntasks = num_sublists;
+			evict = left;
+		}
+
 
 		/*
 		 * Start eviction using a randomly selected sublist,
@@ -4107,10 +4194,29 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		 * (e.g. index 0) would cause evictions to favor certain
 		 * sublists over others.
 		 */
-		for (int i = 0; i < num_sublists; i++) {
+		for (unsigned i = 0; i < ntasks; i++, sublist_idx++) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
+			/* we've reached the end, wrap to the beginning */
+			if (sublist_idx >= num_sublists)
+				sublist_idx = 0;
+
+			if (use_evcttq) {
+				taskq_init_ent(&evarg[i].tqe);
+				evarg[i].ml = ml;
+				evarg[i].marker = markers[sublist_idx];
+				evarg[i].spa = spa;
+				evarg[i].evicted_ptr = &scan_evicted;
+				evarg[i].idx = sublist_idx;
+				evarg[i].bytes = evict;
+
+				taskq_dispatch_ent(arc_evict_taskq,
+				    arc_evict_task,
+				    &evarg[i], 0, &evarg[i].tqe);
+				continue;
+			}
+
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
@@ -4121,10 +4227,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
+		}
 
-			/* we've reached the end, wrap to the beginning */
-			if (++sublist_idx >= num_sublists)
-				sublist_idx = 0;
+		if (use_evcttq) {
+			taskq_wait(arc_evict_taskq);
+			total_evicted += scan_evicted;
 		}
 
 		/*
@@ -4151,11 +4258,15 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		}
 	}
 
+	if (use_evcttq)
+		kmem_free(evarg, sizeof (*evarg) * nthreads);
+
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
+
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);
 
@@ -7789,6 +7900,7 @@ arc_set_limits(uint64_t allmem)
 	/* How to set default max varies by platform. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
+
 void
 arc_init(void)
 {
@@ -7866,6 +7978,27 @@ arc_init(void)
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
+	if (max_ncpus > 1) {
+		if (zfs_arc_evict_threads == 0) {
+			/*
+			 * Limit the maximum number of threads by 16.
+			 * We reach the limit when max_ncpu == 256.
+			 */
+			uint_t nthreads = MIN((highbit64(max_ncpus) - 1) +
+			    max_ncpus / 32, 16);
+			zfs_arc_evict_threads_max = max_ncpus < 4 ? 1 :
+			    nthreads;
+		} else {
+			zfs_arc_evict_threads_max = max_ncpus / 2;
+		}
+
+		if (zfs_arc_evict_threads_max > 1) {
+			arc_evict_taskq = taskq_create("arc_evict",
+			    zfs_arc_evict_threads_max,
+			    defclsyspri, 0, INT_MAX, TASKQ_PREPOPULATE);
+		}
+	}
+
 	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
 	    offsetof(arc_async_flush_t, af_node));
 	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -7949,6 +8082,11 @@ arc_fini(void)
 		arc_ksp = NULL;
 	}
 
+	if (arc_evict_taskq != NULL) {
+		taskq_wait(arc_evict_taskq);
+		taskq_destroy(arc_evict_taskq);
+	}
+
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
@@ -11094,3 +11232,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RW,
+	"Controls the number of ARC eviction threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads_max, UINT, ZMOD_RD,
+	"The number of allocated ARC eviction threads");