Skip to content

Commit

Permalink
Latest set of feedback changes (from Brian)
Browse files Browse the repository at this point in the history
Signed-off-by: Don Brady <[email protected]>
  • Loading branch information
don-brady committed Jan 4, 2025
1 parent 00e4044 commit cff27ce
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 90 deletions.
2 changes: 2 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
Expand All @@ -132,6 +133,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_noutliers; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
Expand Down
14 changes: 9 additions & 5 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -501,11 +501,15 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value.
.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong
For expanded RAID-Z, aggregate reads that have more rows than this.
.
.It Sy raid_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
For RAID-Z and dRAID only, this is the slow disk sit out time period in
seconds.
When a slow disk outlier is detected, then it gets placed in a sit out
during reads for the duration of this time period.
.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
When a slow disk outlier is detected it is placed in a sit out state.
While sitting out the disk will not participate in normal reads, instead its
data will be reconstructed as needed from parity.
Resilver and scrub operations will always read from a disk, even if it's
sitting out.
Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
Writes will still be issued to a disk which is sitting out to maintain full
redundancy.
Defaults to 600 seconds and a value of zero disables slow disk outlier
detection.
.
Expand Down
42 changes: 23 additions & 19 deletions module/zfs/vdev_draid.c
Original file line number Diff line number Diff line change
Expand Up @@ -1889,17 +1889,6 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
/* Sequential rebuild must do IO at redundancy group boundary. */
IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);

/*
* Calculate how much parity is available for sitting out reads
*/
int parity_avail = rr->rr_firstdatacol;
for (int p = 0; p < rr->rr_firstdatacol; p++) {
raidz_col_t *rc = &rr->rr_col[p];
if (!vdev_draid_readable(vd->vdev_child[rc->rc_devidx],
rc->rc_offset)) {
parity_avail--;
}
}
/*
* Iterate over the columns in reverse order so that we hit the parity
* last. Any errors along the way will force us to read the parity.
Expand Down Expand Up @@ -2004,14 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
}
} else if (parity_avail > 0 && c >= rr->rr_firstdatacol &&
rr->rr_missingdata == 0 &&
vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
parity_avail--;
continue;
} else if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_noutliers++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}

Expand Down
131 changes: 65 additions & 66 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ uint_t raidz_expand_pause_point = 0;
/*
* This represents the duration for a slow drive read sit out.
*/
static unsigned long raid_read_sit_out_secs = 600;
static unsigned long raidz_read_sit_out_secs = 600;

static hrtime_t raid_outlier_check_interval_ms = 20;

Expand Down Expand Up @@ -2293,12 +2293,13 @@ vdev_raidz_min_asize(vdev_t *vd)
*
* In vdev_child_slow_outlier() it looks for outliers based on disk
* latency from the most recent child reads. Here we're checking if,
* over time, a disk has has been an outlier too many times.
 * over time, a disk has been an outlier too many times and is
* now in a sit out period.
*/
boolean_t
vdev_skip_latency_outlier(vdev_t *vd, zio_flag_t io_flags)
{
if (raid_read_sit_out_secs == 0)
if (raidz_read_sit_out_secs == 0)
return (B_FALSE);

/* Avoid skipping a data column read when resilvering */
Expand Down Expand Up @@ -2482,18 +2483,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
vdev_t *vd = zio->io_vd;

/*
* Calculate how much parity is available for sitting out reads
*/
int parity_avail = rr->rr_firstdatacol;
for (int p = 0; p < rr->rr_firstdatacol; p++) {
raidz_col_t *rc = &rr->rr_col[p];
if (rc->rc_size > 0 &&
!vdev_readable(vd->vdev_child[rc->rc_devidx])) {
parity_avail--;
}
}

/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
Expand All @@ -2513,19 +2502,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
/*
* Check if a data colummn read should be skipped
*/
if (parity_avail > 0 &&
c >= rr->rr_firstdatacol &&
rr->rr_missingdata == 0 &&
vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
parity_avail--;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rr->rr_firstdatacol)
rr->rr_missingdata++;
Expand All @@ -2535,6 +2511,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}

if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_noutliers++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}

for (int c = rr->rr_cols - 1; c >= 0; c--) {
raidz_col_t *rc = &rr->rr_col[c];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

if (rc->rc_error || rc->rc_size == 0)
continue;

if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
Expand Down Expand Up @@ -2570,12 +2580,6 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
prc->rc_skipped = 1;
continue;
}
/* XXX is this a good place to be skipping reads? */
if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
prc->rc_error = SET_ERROR(EAGAIN);
prc->rc_skipped = 1;
continue;
}
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
prc->rc_offset, prc->rc_abd, prc->rc_size,
zio->io_type, zio->io_priority, 0,
Expand Down Expand Up @@ -2856,7 +2860,7 @@ vdev_check_outlier_state(vdev_t *vd)
* Begin a sit out period for this slow drive
*/
vd->vdev_read_sit_out_expire = gethrtime() +
SEC2NSEC(raid_read_sit_out_secs);
SEC2NSEC(raidz_read_sit_out_secs);

/* count each slow io period */
mutex_enter(&vd->vdev_stat_lock);
Expand All @@ -2865,8 +2869,8 @@ vdev_check_outlier_state(vdev_t *vd)

(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, vd->vdev_spa,
vd, NULL, NULL, 0);
zfs_dbgmsg("vd-%d begin read sit out for %d secs",
(int)vd->vdev_id, (int)raid_read_sit_out_secs);
vdev_dbgmsg(vd, "begin read sit out for %d secs",
(int)raidz_read_sit_out_secs);

vdev_t *raidvd = vd->vdev_parent;
for (int c = 0; c < raidvd->vdev_children; c++) {
Expand Down Expand Up @@ -2919,22 +2923,18 @@ latency_quartiles_fence(const uint64_t *data, size_t n)
return (fence);
}

/*
 * Sort a small array of latency samples in ascending order.
 * Insertion sort is used since the sample count is always small.
 */
static void
latency_sort(uint64_t *samples, int size)
{
	for (int idx = 1; idx < size; idx++) {
		uint64_t key = samples[idx];
		int pos = idx - 1;

		while (pos >= 0 && samples[pos] > key) {
			samples[pos + 1] = samples[pos];
			pos--;
		}
		samples[pos + 1] = key;
	}
}
#define LAT_SAMPLES_STACK 32
#define LAT_SAMPLES_MIN 5
#define LAT_OUTLIER_LIMIT 50

#define STACK_SAMPLES 32
#define MIN_LAT_SAMPLES 5
/*
 * qsort(3) comparator ordering uint64_t latency samples ascending.
 * TREE_CMP() is presumably the usual three-way compare (-1/0/1) used
 * throughout this codebase — confirm against its definition.
 */
static int
latency_compare(const void *arg1, const void *arg2)
{
	const uint64_t *l1 = (uint64_t *)arg1;
	const uint64_t *l2 = (uint64_t *)arg2;

	return (TREE_CMP(*l1, *l2));
}

/*
* Check for any latency outlier from latest set of child reads.
Expand All @@ -2944,11 +2944,11 @@ latency_sort(uint64_t *samples, int size) {
* third quartile plus two times the Interquartile Range (IQR). This range
* is the distance between the first and third quartile.
*/
noinline static void
static void
vdev_child_slow_outlier(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
if (raid_read_sit_out_secs == 0 || vd->vdev_children < MIN_LAT_SAMPLES)
if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
return;

spa_t *spa = zio->io_spa;
Expand All @@ -2968,10 +2968,10 @@ vdev_child_slow_outlier(zio_t *zio)
}

int samples = vd->vdev_children;
uint64_t data[STACK_SAMPLES];
uint64_t data[LAT_SAMPLES_STACK];
uint64_t *lat_data;

if (samples > STACK_SAMPLES)
if (samples > LAT_SAMPLES_STACK)
lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP);
else
lat_data = &data[0];
Expand Down Expand Up @@ -3007,22 +3007,22 @@ vdev_child_slow_outlier(zio_t *zio)
}
}

latency_sort(lat_data, samples);
qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
uint64_t fence = latency_quartiles_fence(lat_data, samples);
if (lat_data[samples - 1] > fence) {
/*
* Keep track of how many times this child has had
 * an outlier read. A disk that persistently has a
* higer than peers outlier count will be considered
* higher than peers outlier count will be considered
* a slow disk.
*/
if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > 50 &&
svd == ovd) {
if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) >
LAT_OUTLIER_LIMIT && svd == ovd) {
vdev_check_outlier_state(svd);
}
}
out:
if (samples > STACK_SAMPLES)
if (samples > LAT_SAMPLES_STACK)
kmem_free(lat_data, sizeof (uint64_t) * samples);
}

Expand Down Expand Up @@ -3766,11 +3766,10 @@ vdev_raidz_io_done(zio_t *zio)
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);

/* Periodically check for a read outlier */
if (i == 0 && zio->io_type == ZIO_TYPE_READ)
vdev_child_slow_outlier(zio);
}
/* Periodically check for a read outlier */
if (zio->io_type == ZIO_TYPE_READ)
vdev_child_slow_outlier(zio);
zio_checksum_verified(zio);
} else {
/*
Expand Down Expand Up @@ -5406,6 +5405,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
ZFS_MODULE_PARAM(zfs_vdev, raid_, read_sit_out_secs, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, raidz_, read_sit_out_secs, ULONG, ZMOD_RW,
"Raidz/draid slow disk sit out time period in seconds");
/* END CSTYLED */

0 comments on commit cff27ce

Please sign in to comment.