diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index dc474e3739f3..547cfb72d1c4 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -379,6 +379,7 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
+ VDEV_PROP_SIT_OUT_READS,
VDEV_NUM_PROPS
} vdev_prop_t;
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..a9310f16fffb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -285,6 +285,7 @@ struct vdev {
boolean_t vdev_ishole; /* is a hole in the namespace */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
+ uint64_t vdev_last_latency_check;
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@@ -432,6 +433,9 @@ struct vdev {
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
+ uint64_t vdev_outlier_count; /* read outlier amongst peers */
+ uint64_t vdev_ewma_latency; /* moving average read latency */
+ hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
list_node_t vdev_leaf_node; /* leaf vdev list */
/*
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 64f484e9aa13..25f1bee72a27 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
void vdev_raidz_reflow_copy_scratch(spa_t *);
void raidz_dtl_reassessed(vdev_t *);
+boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 45cb5864a22b..07f8e560c747 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -118,6 +118,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
+ uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
@@ -132,6 +133,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
+ int rr_outlier_cnt; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 1f9fde6677d8..f2aef8754460 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5917,7 +5917,8 @@
-
+
+
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 64f9d1f6eb49..ec30d2a6ddd3 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
/* Only use if provided by the RAIDZ VDEV above */
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
return (ENOENT);
+ if (prop == VDEV_PROP_SIT_OUT_READS)
+ return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
(const char **)&strval) != 0)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..c61303d28c91 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -501,6 +501,18 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value.
.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong
For expanded RAID-Z, aggregate reads that have more rows than this.
.
+.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
+When a slow disk outlier is detected it is placed in a sit out state.
+While sitting out the disk will not participate in normal reads, instead its
+data will be reconstructed as needed from parity.
+Resilver and scrub operations will always read from a disk, even if it's
+sitting out.
+Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
+Writes will still be issued to a disk which is sitting out to maintain full
+redundancy.
+Defaults to 600 seconds and a value of zero disables slow disk outlier
+detection.
+.
.It Sy reference_history Ns = Ns Sy 3 Pq int
Maximum reference holders being tracked when reference_tracking_enable is
active.
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 34d4026b1009..844864518c1e 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -104,12 +104,19 @@ Comma separated list of children of this vdev
The number of children belonging to this vdev
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
The number of errors of each type encountered by this vdev
+.It Sy sit_out_reads
+True when a slow disk outlier was detected and the vdev is currently in a sit
+out state.
+While sitting out, the vdev will not participate in normal reads, instead its
+data will be reconstructed as needed from parity.
.It Sy slow_ios
The number of slow I/Os encountered by this vdev,
These represent I/O operations that didn't complete in
.Sy zio_slow_io_ms
milliseconds
.Pq Sy 30000 No by default .
+This count is also incremented when a vdev is determined to be a RAID-Z
+leaf latency outlier.
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
The number of I/O operations of each type performed by this vdev
.It Xo
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index ea9eda4b316d..461fc7faefe2 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -466,6 +466,9 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
+ zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0,
+ PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS",
+ boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..003caceb3328 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
+ atomic_store_64(&vd->vdev_outlier_count, 0);
+ vd->vdev_read_sit_out_expire = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@@ -6361,6 +6363,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
+ case VDEV_PROP_SIT_OUT_READS:
+ /* Only expose this for a draid or raidz leaf */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_top != NULL &&
+ (vd->vdev_top->vdev_ops ==
+ &vdev_raidz_ops ||
+ vd->vdev_top->vdev_ops ==
+ &vdev_draid_ops)) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vdev_sit_out_reads(vd, 0),
+ ZPROP_SRC_NONE);
+ }
+ continue;
case VDEV_PROP_TRIM_SUPPORT:
/* only valid for leaf vdevs */
if (vd->vdev_ops->vdev_op_leaf) {
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 419c8ac5bb28..7125612ae205 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1993,6 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
}
+ } else if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+ rr->rr_outlier_cnt++;
+ rc->rc_latency_outlier = 1;
+ }
+ }
+
+ /*
+ * When the row contains a latency outlier and sufficient parity
+ * exists to reconstruct the column data, then skip reading the
+ * known slow child vdev as a performance optimization.
+ */
+ if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 &&
+ (rr->rr_firstdatacol - rr->rr_missingparity) > 0) {
+
+ for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_latency_outlier) {
+ rr->rr_missingdata++;
+ rc->rc_error = SET_ERROR(EAGAIN);
+ rc->rc_skipped = 1;
+ break;
+ }
}
}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 6103f780e6bc..d761e8be9135 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -354,6 +354,13 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
*/
uint_t raidz_expand_pause_point = 0;
+/*
+ * This represents the duration for a slow drive read sit out.
+ */
+static unsigned long raidz_read_sit_out_secs = 600;
+
+static hrtime_t raid_outlier_check_interval_ms = 20;
+
/*
* Maximum amount of copy io's outstanding at once.
*/
@@ -2281,6 +2288,45 @@ vdev_raidz_min_asize(vdev_t *vd)
vd->vdev_children);
}
+/*
+ * Return B_TRUE if a read should be skipped due to being too slow.
+ *
+ * In vdev_child_slow_outlier() it looks for outliers based on disk
+ * latency from the most recent child reads. Here we're checking if,
+ * over time, a disk has been an outlier too many times and is
+ * now in a sit out period.
+ */
+boolean_t
+vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
+{
+ if (raidz_read_sit_out_secs == 0)
+ return (B_FALSE);
+
+ /* Avoid skipping a data column read when scrubbing or resilvering */
+ if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+ return (B_FALSE);
+
+ return (vd->vdev_read_sit_out_expire >= gethrtime());
+}
+
+/*
+ * Calculate the Exponential Weighted Moving Average (EWMA)
+ * where
+ * alpha: the smoothing factor -- represented here as a scaled integer
+ * scale: the number of bits used to scale alpha
+ */
+static uint64_t
+calculate_ewma(uint64_t previous_ewma, uint64_t latest_value) {
+ /*
+ * Scale using 16 bits with an effective alpha of 0.50
+ */
+ const uint64_t scale = 16;
+ const uint64_t alpha = 32768;
+
+ return (((alpha * latest_value) + (((1ULL << scale) - alpha) *
+ previous_ewma)) >> scale);
+}
+
void
vdev_raidz_child_done(zio_t *zio)
{
@@ -2290,6 +2336,23 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_error = zio->io_error;
rc->rc_tried = 1;
rc->rc_skipped = 0;
+
+ /*
+ * Process the disk io latency before it goes out of scope.
+ *
+ * A zio->io_delay value of zero means this IO was part of
+ * an aggregation.
+ */
+ if (zio->io_type == ZIO_TYPE_READ && zio->io_error == 0 &&
+ zio->io_size > 0 && zio->io_delay != 0) {
+ vdev_t *vd = zio->io_vd;
+ uint64_t previous_ewma = atomic_load_64(&vd->vdev_ewma_latency);
+ if (previous_ewma == 0)
+ previous_ewma = zio->io_delay;
+
+ atomic_store_64(&vd->vdev_ewma_latency,
+ calculate_ewma(previous_ewma, zio->io_delay));
+ }
}
static void
@@ -2445,6 +2508,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
+
+ if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+ rr->rr_outlier_cnt++;
+ rc->rc_latency_outlier = 1;
+ }
+ }
+
+ /*
+ * When the row contains a latency outlier and sufficient parity
+ * exists to reconstruct the column data, then skip reading the
+ * known slow child vdev as a performance optimization.
+ */
+ if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 &&
+ (rr->rr_firstdatacol - rr->rr_missingparity) > 0) {
+
+ for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_latency_outlier) {
+ rr->rr_missingdata++;
+ rc->rc_error = SET_ERROR(EAGAIN);
+ rc->rc_skipped = 1;
+ break;
+ }
+ }
+ }
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error || rc->rc_size == 0)
+ continue;
+
if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
@@ -2468,6 +2565,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
ASSERT3U(prc->rc_devidx, ==, i);
vdev_t *cvd = vd->vdev_child[i];
+
if (!vdev_readable(cvd)) {
prc->rc_error = SET_ERROR(ENXIO);
prc->rc_tried = 1; /* don't even try */
@@ -2744,6 +2842,175 @@ vdev_raidz_worst_error(raidz_row_t *rr)
return (error);
}
+/*
+ * Find the median value from a set of n values
+ */
+static uint64_t
+latency_median_value(const uint64_t *data, size_t n)
+{
+ uint64_t m;
+
+ if (n % 2 == 0)
+ m = (data[(n>>1) - 1] + data[n>>1]) >> 1;
+ else
+ m = data[((n + 1) >> 1) - 1];
+
+ return (m);
+}
+
+/*
+ * Calculate the outlier fence from a set of n latency values
+ *
+ * fence = Q3 + (Q3 - Q1)
+ */
+static uint64_t
+latency_quartiles_fence(const uint64_t *data, size_t n)
+{
+ uint64_t q1, q3;
+
+ q1 = latency_median_value(&data[0], n>>1);
+ if (n % 2 == 0)
+ q3 = latency_median_value(&data[n>>1], n>>1);
+ else
+ q3 = latency_median_value(&data[(n+1) >> 1], n>>1);
+
+ uint64_t iqr = q3 - q1;
+ uint64_t fence = q3 + iqr;
+
+ return (fence);
+}
+
+#define LAT_SAMPLES_STACK 64
+#define LAT_SAMPLES_MIN 5
+#define LAT_OUTLIER_LIMIT 50
+
+static int
+latency_compare(const void *arg1, const void *arg2)
+{
+ const uint64_t *l1 = (uint64_t *)arg1;
+ const uint64_t *l2 = (uint64_t *)arg2;
+
+ return (TREE_CMP(*l1, *l2));
+}
+
+/*
+ * Check for any latency outlier from latest set of child reads.
+ *
+ * Uses a Tukey's fence, with K = 1, for detecting extreme outliers. This
+ * rule defines extreme outliers as data points outside the fence of the
+ * third quartile plus the Interquartile Range (IQR). This range is the
+ * distance between the first and third quartile.
+ * NOTE(review): the classic Tukey constant is 1.5 (or 3 for "far out");
+ * confirm K = 1 is the intended sensitivity.
+ */
+static void
+vdev_child_slow_outlier(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
+ return;
+
+ spa_t *spa = zio->io_spa;
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER ||
+ (spa_load_state(spa) != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)) {
+ return;
+ }
+
+ hrtime_t now = gethrtime();
+ uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
+
+ if ((now - last) < MSEC2NSEC(raid_outlier_check_interval_ms) ||
+ atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) {
+ return;
+ }
+
+ int samples = vd->vdev_children;
+ uint64_t data[LAT_SAMPLES_STACK];
+ uint64_t *lat_data;
+
+ if (samples > LAT_SAMPLES_STACK)
+ lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP);
+ else
+ lat_data = &data[0];
+
+ uint64_t max = 0;
+ uint64_t max_outier_count = 0;
+ vdev_t *svd = NULL; /* suspect vdev */
+ vdev_t *ovd = NULL; /* largest outlier vdev */
+ for (int c = 0; c < samples; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_read_sit_out_expire != 0) {
+ if (cvd->vdev_read_sit_out_expire < gethrtime()) {
+ /*
+ * Done with our sit out, wait for new outlier
+ * to emerge.
+ */
+ cvd->vdev_read_sit_out_expire = 0;
+ } else {
+ atomic_store_64(&cvd->vdev_ewma_latency, 0);
+ /* Only one sit out disk at a time for now */
+ goto out;
+ }
+ }
+
+ lat_data[c] = atomic_load_64(&cvd->vdev_ewma_latency);
+
+ /* wait until all disks have been read from */
+ if (lat_data[c] == 0)
+ goto out;
+
+ /* keep track of the vdev with largest value */
+ if (lat_data[c] > max) {
+ max = lat_data[c];
+ svd = cvd;
+ }
+
+ uint64_t count = atomic_load_64(&cvd->vdev_outlier_count);
+ if (count > max_outier_count) {
+ max_outier_count = count;
+ ovd = cvd;
+ }
+ }
+
+ qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
+ uint64_t fence = latency_quartiles_fence(lat_data, samples);
+ if (lat_data[samples - 1] > fence) {
+ /*
+ * Keep track of how many times this child has had
+ * an outlier read. A disk that persistently has a
+ * higher outlier count than its peers will be considered
+ * a slow disk.
+ */
+ if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) >
+ LAT_OUTLIER_LIMIT && svd == ovd &&
+ svd->vdev_read_sit_out_expire == 0) {
+ /*
+ * Begin a sit out period for this slow drive
+ */
+ svd->vdev_read_sit_out_expire = gethrtime() +
+ SEC2NSEC(raidz_read_sit_out_secs);
+
+ /* count each slow io period */
+ mutex_enter(&svd->vdev_stat_lock);
+ svd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&svd->vdev_stat_lock);
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, svd,
+ NULL, NULL, 0);
+ vdev_dbgmsg(svd, "begin read sit out for %d secs",
+ (int)raidz_read_sit_out_secs);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ atomic_store_64(
+ &vd->vdev_child[c]->vdev_outlier_count, 0);
+ }
+ }
+ }
+out:
+ if (samples > LAT_SAMPLES_STACK)
+ kmem_free(lat_data, sizeof (uint64_t) * samples);
+}
+
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
@@ -3485,6 +3752,9 @@ vdev_raidz_io_done(zio_t *zio)
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);
}
+ /* Periodically check for a read outlier */
+ if (zio->io_type == ZIO_TYPE_READ)
+ vdev_child_slow_outlier(zio);
zio_checksum_verified(zio);
} else {
/*
@@ -5120,3 +5390,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, read_sit_out_secs, ULONG, ZMOD_RW,
+ "Raidz/draid slow disk sit out time period in seconds");
+/* END CSTYLED */