diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index dc474e3739f3..102ffef016c5 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -379,6 +379,7 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
+ VDEV_PROP_SIT_OUT,
VDEV_NUM_PROPS
} vdev_prop_t;
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..a9310f16fffb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -285,6 +285,7 @@ struct vdev {
boolean_t vdev_ishole; /* is a hole in the namespace */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
+ uint64_t vdev_last_latency_check;
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@@ -432,6 +433,9 @@ struct vdev {
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
+ uint64_t vdev_outlier_count; /* read outlier amongst peers */
+ uint64_t vdev_ewma_latency; /* moving average read latency */
+ hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
list_node_t vdev_leaf_node; /* leaf vdev list */
/*
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 64f484e9aa13..25f1bee72a27 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
void vdev_raidz_reflow_copy_scratch(spa_t *);
void raidz_dtl_reassessed(vdev_t *);
+boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 45cb5864a22b..07f8e560c747 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -118,6 +118,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
+ uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
@@ -132,6 +133,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
+ int rr_outlier_cnt; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 1f9fde6677d8..4f5dd1c983fc 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5917,7 +5917,8 @@
-
+
+
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 64f9d1f6eb49..dc0f0c53730c 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
/* Only use if provided by the RAIDZ VDEV above */
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
return (ENOENT);
+ if (prop == VDEV_PROP_SIT_OUT)
+ return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
(const char **)&strval) != 0)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..9225996d2525 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -591,6 +591,18 @@ new format when enabling the
feature.
The default is to convert all log entries.
.
+.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
+When a slow disk outlier is detected it is placed in a sit out state.
+While sitting out the disk will not participate in normal reads, instead its
+data will be reconstructed as needed from parity.
+Resilver and scrub operations will always read from a disk, even if it's
+sitting out.
+Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
+Writes will still be issued to a disk which is sitting out to maintain full
+redundancy.
+Defaults to 600 seconds and a value of zero disables slow disk outlier
+detection.
+.
.It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
During top-level vdev removal, chunks of data are copied from the vdev
which may include free space in order to trade bandwidth for IOPS.
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 34d4026b1009..229715c35d92 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -104,12 +104,23 @@ Comma separated list of children of this vdev
The number of children belonging to this vdev
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
The number of errors of each type encountered by this vdev
+.It Sy sit_out
+True when a slow disk outlier was detected and the vdev is currently in a sit
+out state.
+While sitting out, the vdev will not participate in normal reads, instead its
+data will be reconstructed as needed from parity.
.It Sy slow_ios
-The number of slow I/Os encountered by this vdev,
-These represent I/O operations that didn't complete in
+This indicates the number of slow I/O operations encountered by this vdev.
+A slow I/O is defined as an operation that did not complete within the
.Sy zio_slow_io_ms
-milliseconds
+threshold in milliseconds
.Pq Sy 30000 No by default .
+For
+.Sy RAIDZ
+and
+.Sy DRAID
+configurations, this value also represents the number of times the vdev was
+identified as an outlier and excluded from participating in read I/O operations.
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
The number of I/O operations of each type performed by this vdev
.It Xo
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index ea9eda4b316d..ef932ec4a0f6 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -466,6 +466,9 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
+ zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
+ PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
+ sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..f03200cdf86c 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
+ atomic_store_64(&vd->vdev_outlier_count, 0);
+ vd->vdev_read_sit_out_expire = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@@ -6361,6 +6363,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
+ case VDEV_PROP_SIT_OUT:
+ /* Only expose this for a draid or raidz leaf */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_top != NULL &&
+ (vd->vdev_top->vdev_ops ==
+ &vdev_raidz_ops ||
+ vd->vdev_top->vdev_ops ==
+ &vdev_draid_ops)) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vdev_sit_out_reads(vd, 0),
+ ZPROP_SRC_NONE);
+ }
+ continue;
case VDEV_PROP_TRIM_SUPPORT:
/* only valid for leaf vdevs */
if (vd->vdev_ops->vdev_op_leaf) {
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 419c8ac5bb28..326dfcabfb81 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1993,6 +1993,31 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
}
+ } else if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+ rr->rr_outlier_cnt++;
+ ASSERT0(rc->rc_latency_outlier);
+ rc->rc_latency_outlier = 1;
+ }
+ }
+
+ /*
+ * When the row contains a latency outlier and sufficient parity
+ * exists to reconstruct the column data, then skip reading the
+ * known slow child vdev as a performance optimization.
+ */
+ if (rr->rr_outlier_cnt > 0 &&
+ (rr->rr_firstdatacol - rr->rr_missingparity) >=
+ (rr->rr_missingdata + 1)) {
+
+ for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error == 0 && rc->rc_latency_outlier) {
+ rr->rr_missingdata++;
+ rc->rc_error = SET_ERROR(EAGAIN);
+ rc->rc_skipped = 1;
+ break;
+ }
}
}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 6103f780e6bc..6d8385856998 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -354,6 +354,13 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
*/
uint_t raidz_expand_pause_point = 0;
+/*
+ * This represents the duration for a slow drive read sit out.
+ */
+static unsigned long vdev_read_sit_out_secs = 600;
+
+static hrtime_t raid_outlier_check_interval_ms = 20;
+
/*
* Maximum amount of copy io's outstanding at once.
*/
@@ -2281,6 +2288,45 @@ vdev_raidz_min_asize(vdev_t *vd)
vd->vdev_children);
}
+/*
+ * Return B_TRUE if a read should be skipped due to being too slow.
+ *
+ * In vdev_child_slow_outlier() it looks for outliers based on disk
+ * latency from the most recent child reads. Here we're checking if,
+ * over time, a disk has been an outlier too many times and is
+ * now in a sit out period.
+ */
+boolean_t
+vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
+{
+ if (vdev_read_sit_out_secs == 0)
+ return (B_FALSE);
+
+ /* Avoid skipping a data column read when scrubbing */
+ if (io_flags & ZIO_FLAG_SCRUB)
+ return (B_FALSE);
+
+ return (vd->vdev_read_sit_out_expire >= gethrestime_sec());
+}
+
+/*
+ * Calculate the Exponential Weighted Moving Average (EWMA)
+ * where
+ * alpha: the smoothing factor -- represented here as a scaled integer
+ * scale: the number of bits used to scale alpha
+ */
+static uint64_t
+calculate_ewma(uint64_t previous_ewma, uint64_t latest_value) {
+ /*
+ * Scale using 8 bits with an effective alpha of 0.25
+ */
+ const uint64_t scale = 8;
+ const uint64_t alpha = 64;
+
+ return (((alpha * latest_value) + (((1ULL << scale) - alpha) *
+ previous_ewma)) >> scale);
+}
+
void
vdev_raidz_child_done(zio_t *zio)
{
@@ -2290,6 +2336,23 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_error = zio->io_error;
rc->rc_tried = 1;
rc->rc_skipped = 0;
+
+ /*
+ * Process the disk io latency before it goes out of scope.
+ *
+ * A zio->io_delay value of zero means this IO was part of
+ * an aggregation.
+ */
+ if (vdev_read_sit_out_secs != 0 && zio->io_type == ZIO_TYPE_READ &&
+ zio->io_error == 0 && zio->io_size > 0 && zio->io_delay != 0) {
+ vdev_t *vd = zio->io_vd;
+ uint64_t previous_ewma = atomic_load_64(&vd->vdev_ewma_latency);
+ if (previous_ewma == 0)
+ previous_ewma = zio->io_delay;
+
+ atomic_store_64(&vd->vdev_ewma_latency,
+ calculate_ewma(previous_ewma, zio->io_delay));
+ }
}
static void
@@ -2445,6 +2508,42 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
+
+ if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+ rr->rr_outlier_cnt++;
+ ASSERT0(rc->rc_latency_outlier);
+ rc->rc_latency_outlier = 1;
+ }
+ }
+
+ /*
+ * When the row contains a latency outlier and sufficient parity
+ * exists to reconstruct the column data, then skip reading the
+ * known slow child vdev as a performance optimization.
+ */
+ if (rr->rr_outlier_cnt > 0 &&
+ (rr->rr_firstdatacol - rr->rr_missingparity) >=
+ (rr->rr_missingdata + 1)) {
+
+ for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error == 0 && rc->rc_latency_outlier) {
+ rr->rr_missingdata++;
+ rc->rc_error = SET_ERROR(EAGAIN);
+ rc->rc_skipped = 1;
+ break;
+ }
+ }
+ }
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error || rc->rc_size == 0)
+ continue;
+
if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
@@ -2468,6 +2567,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
ASSERT3U(prc->rc_devidx, ==, i);
vdev_t *cvd = vd->vdev_child[i];
+
if (!vdev_readable(cvd)) {
prc->rc_error = SET_ERROR(ENXIO);
prc->rc_tried = 1; /* don't even try */
@@ -2744,6 +2844,161 @@ vdev_raidz_worst_error(raidz_row_t *rr)
return (error);
}
+/*
+ * Find the median value from a set of n values
+ */
+static uint64_t
+latency_median_value(const uint64_t *data, size_t n)
+{
+ uint64_t m;
+
+ if (n % 2 == 0)
+ m = (data[(n>>1) - 1] + data[n>>1]) >> 1;
+ else
+ m = data[((n + 1) >> 1) - 1];
+
+ return (m);
+}
+
+/*
+ * Calculate the outlier fence from a set of n latency values
+ *
+ * fence = Q3 + 2 x (Q3 - Q1)
+ */
+static uint64_t
+latency_quartiles_fence(const uint64_t *data, size_t n)
+{
+ uint64_t q1, q3;
+
+ q1 = latency_median_value(&data[0], n>>1);
+ if (n % 2 == 0)
+ q3 = latency_median_value(&data[n>>1], n>>1);
+ else
+ q3 = latency_median_value(&data[(n+1) >> 1], n>>1);
+
+ /*
+ * To avoid detecting false positive outliers when N is small and
+ * the latency values are very close, make sure the fence
+ * is at least 25% larger than Q1.
+ */
+ uint64_t iqr = MAX(q3 - q1, q1 >> 3);
+
+ return (q3 + (iqr << 1));
+}
+
+#define LAT_SAMPLES_STACK 64
+#define LAT_SAMPLES_MIN 5
+#define LAT_OUTLIER_LIMIT 50
+
+static int
+latency_compare(const void *arg1, const void *arg2)
+{
+ const uint64_t *l1 = (uint64_t *)arg1;
+ const uint64_t *l2 = (uint64_t *)arg2;
+
+ return (TREE_CMP(*l1, *l2));
+}
+
+/*
+ * Check for any latency outlier from latest set of child reads.
+ *
+ * Uses a Tukey's fence, with K = 2, for detecting extreme outliers. This
+ * rule defines extreme outliers as data points outside the fence of the
+ * third quartile plus two times the Interquartile Range (IQR). This range
+ * is the distance between the first and third quartile.
+ */
+static void
+vdev_child_slow_outlier(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ if (vdev_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
+ return;
+
+ hrtime_t now = gethrtime();
+ uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
+
+ if ((now - last) < MSEC2NSEC(raid_outlier_check_interval_ms) ||
+ atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) {
+ return;
+ }
+
+ int samples = vd->vdev_children;
+ uint64_t data[LAT_SAMPLES_STACK];
+ uint64_t *lat_data;
+
+ if (samples > LAT_SAMPLES_STACK)
+ lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP);
+ else
+ lat_data = &data[0];
+
+ uint64_t max = 0;
+ vdev_t *svd = NULL; /* suspect vdev */
+ for (int c = 0; c < samples; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_read_sit_out_expire != 0) {
+ if (cvd->vdev_read_sit_out_expire < gethrestime_sec()) {
+ /*
+ * Done with our sit out, wait for new outlier
+ * to emerge.
+ */
+ cvd->vdev_read_sit_out_expire = 0;
+ } else {
+ atomic_store_64(&cvd->vdev_ewma_latency, 0);
+ /* Only one sit out disk at a time for now */
+ goto out;
+ }
+ }
+
+ lat_data[c] = atomic_load_64(&cvd->vdev_ewma_latency);
+
+ /* wait until all disks have been read from */
+ if (lat_data[c] == 0)
+ goto out;
+
+ /* keep track of the vdev with largest value */
+ if (lat_data[c] > max) {
+ max = lat_data[c];
+ svd = cvd;
+ }
+ }
+
+ qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
+ uint64_t fence = latency_quartiles_fence(lat_data, samples);
+ if (lat_data[samples - 1] > fence) {
+ /*
+ * Keep track of how many times this child has had
+ * an outlier read. A disk that persistently has a
+ * higher than peers outlier count will be considered
+ * a slow disk.
+ */
+ if (++svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
+ ASSERT0(svd->vdev_read_sit_out_expire);
+ /*
+ * Begin a sit out period for this slow drive
+ */
+ svd->vdev_read_sit_out_expire = gethrestime_sec() +
+ vdev_read_sit_out_secs;
+
+ /* count each slow io period */
+ mutex_enter(&svd->vdev_stat_lock);
+ svd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&svd->vdev_stat_lock);
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, svd, NULL, NULL, 0);
+ vdev_dbgmsg(svd, "begin read sit out for %d secs",
+ (int)vdev_read_sit_out_secs);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vd->vdev_child[c]->vdev_outlier_count = 0;
+ }
+ }
+out:
+ if (samples > LAT_SAMPLES_STACK)
+ kmem_free(lat_data, sizeof (uint64_t) * samples);
+}
+
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
@@ -2813,7 +3068,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
"offset=%llx",
zio, c, rc->rc_devidx, (long long)rc->rc_offset);
-
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE,
@@ -3485,6 +3739,9 @@ vdev_raidz_io_done(zio_t *zio)
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);
}
+ /* Periodically check for a read outlier */
+ if (zio->io_type == ZIO_TYPE_READ)
+ vdev_child_slow_outlier(zio);
zio_checksum_verified(zio);
} else {
/*
@@ -5120,3 +5377,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
+ "Raidz/draid slow disk sit out time period in seconds");
+/* END CSTYLED */
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 8a4a4b0f5cb8..1ed9478c40e6 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -702,6 +702,10 @@ tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
'dio_unaligned_block', 'dio_unaligned_filesize']
tags = ['functional', 'direct']
+[tests/functional/events]
+tests = ['slow_vdev_sit_out']
+tags = ['functional', 'events']
+
[tests/functional/exec]
tests = ['exec_001_pos', 'exec_002_neg']
tags = ['functional', 'exec']
@@ -900,8 +904,8 @@ tags = ['functional', 'rename_dirs']
tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
'attach_resilver', 'detach', 'rebuild_disabled_feature',
'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
- 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
- 'scrub_cancel']
+ 'replace_resilver', 'replace_resilver_sit_out', 'resilver_restart_001',
+ 'resilver_restart_002', 'scrub_cancel']
tags = ['functional', 'replacement']
[tests/functional/reservation]
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 9cf919c3dd0f..9d77ea04aa3a 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -1109,6 +1109,16 @@ function get_pool_prop # property pool
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
}
+# Get the specified vdev property in parsable format or fail
+function get_vdev_prop
+{
+ typeset prop="$1"
+ typeset pool="$2"
+ typeset vdev="$3"
+
+ zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev"
+}
+
# Return 0 if a pool exists; $? otherwise
#
# $1 - pool name
@@ -1586,6 +1596,7 @@ function create_pool #pool devs_list
if is_global_zone ; then
[[ -d /$pool ]] && rm -rf /$pool
+ log_note "zpool create -f $pool $@"
log_must zpool create -f $pool $@
fi
@@ -1967,6 +1978,28 @@ function wait_vdev_state # pool disk state timeout
return 1
}
+#
+# Wait for vdev 'sit_out' property to be cleared.
+#
+# $1 pool name
+# $2 vdev name
+# $3 timeout
+#
+function wait_sit_out #pool vdev timeout
+{
+ typeset pool=${1:-$TESTPOOL}
+ typeset vdev="$2"
+ typeset timeout=${3:-300}
+ for (( timer = 0; timer < $timeout; timer++ )); do
+ if [ "$(get_vdev_prop sit_out "$pool" "$vdev")" = "off" ]; then
+ return 0
+ fi
+ sleep 1;
+ done
+
+ return 1
+}
+
#
# Check the output of 'zpool status -v ',
# and to see if the content of contain the specified.
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 2024c44cc138..9d3b9c4f2d65 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -69,6 +69,7 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval
OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize
PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes
+READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index df183825dc68..42e7bce856d9 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1497,6 +1497,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/events_001_pos.ksh \
functional/events/events_002_pos.ksh \
functional/events/setup.ksh \
+ functional/events/slow_vdev_sit_out.ksh \
functional/events/zed_cksum_config.ksh \
functional/events/zed_cksum_reported.ksh \
functional/events/zed_diagnose_multiple.ksh \
@@ -1884,6 +1885,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/replacement/replace_import.ksh \
functional/replacement/replace_rebuild.ksh \
functional/replacement/replace_resilver.ksh \
+ functional/replacement/replace_resilver_sit_out.ksh \
functional/replacement/resilver_restart_001.ksh \
functional/replacement/resilver_restart_002.ksh \
functional/replacement/scrub_cancel.ksh \
diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
new file mode 100755
index 000000000000..64be51c70d5c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
@@ -0,0 +1,91 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+
+# DESCRIPTION:
+# Verify that vdevs 'sit out' when they are slow
+#
+# STRATEGY:
+# 1. Create various raidz/draid pools
+# 2. Inject delays into one of the disks
+# 3. Verify disk is set to 'sit out' for awhile.
+# 4. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+function cleanup
+{
+ restore_tunable READ_SIT_OUT_SECS
+ log_must zinject -c all
+ destroy_pool $TESTPOOL2
+ log_must rm -f $TEST_BASE_DIR/vdev.$$.*
+}
+
+log_assert "Verify sit_out works"
+
+log_onexit cleanup
+
+# shorten sit out period for testing
+save_tunable READ_SIT_OUT_SECS
+set_tunable64 READ_SIT_OUT_SECS 5
+
+log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}
+
+for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do
+ log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
+ log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=100
+ log_must zpool export $TESTPOOL2
+ log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2
+
+ BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9
+
+ # Initial state should not be sitting out
+ log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]]
+
+ # Delay our reads 200ms to trigger sit out
+ log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2
+
+ # Do some reads and wait for us to sit out
+ for i in {1..100} ; do
+ dd if=/$TESTPOOL2/bigfile skip=$i bs=1M count=1 of=/dev/null
+
+ sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)
+ if [[ "$sit_out" == "on" ]] ; then
+ break
+ fi
+ done
+
+ log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on"
+
+ # Clear fault injection
+ log_must zinject -c all
+
+ # Wait for us to exit our sit out period
+ log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10
+
+ log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"
+ destroy_pool $TESTPOOL2
+done
+
+log_pass "sit_out works correctly"
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh
new file mode 100755
index 000000000000..7e2dfdae6783
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh
@@ -0,0 +1,184 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Replacing disks while a disk is sitting out reads should pass
+#
+# STRATEGY:
+# 1. Create raidz and draid pools
+# 2. Make one disk slower and trigger a read sit out for that disk
+# 3. Start some random I/O
+# 4. Replace a disk in the pool with another disk.
+# 5. Verify the integrity of the file system and the resilvering.
+#
+
+verify_runnable "global"
+
+save_tunable READ_SIT_OUT_SECS
+set_tunable64 READ_SIT_OUT_SECS 120
+
+function cleanup
+{
+ restore_tunable READ_SIT_OUT_SECS
+ log_must zinject -c all
+
+ if [[ -n "$child_pids" ]]; then
+ for wait_pid in $child_pids
+ do
+ kill $wait_pid
+ done
+ fi
+
+ if poolexists $TESTPOOL1; then
+ destroy_pool $TESTPOOL1
+ fi
+
+ [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O with a sit out completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function replace_test
+{
+ typeset -i iters=2
+ typeset disk1=$1
+ typeset disk2=$2
+
+ typeset i=0
+ while [[ $i -lt $iters ]]; do
+ log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
+ file_trunc $options $TESTDIR/$TESTFILE.$i &
+ typeset pid=$!
+
+ sleep 1
+
+ child_pids="$child_pids $pid"
+ ((i = i + 1))
+ done
+
+ # replace disk with a slow drive still present
+ SECONDS=0
+ log_must zpool replace -w $TESTPOOL1 $disk1 $disk2
+ log_note took $SECONDS seconds to replace disk
+
+ for wait_pid in $child_pids
+ do
+ kill $wait_pid
+ done
+ child_pids=""
+
+ log_must zinject -c all
+ log_must zpool export $TESTPOOL1
+ log_must zpool import -d $TESTDIR $TESTPOOL1
+ log_must zfs umount $TESTPOOL1/$TESTFS1
+ log_must zdb -cdui $TESTPOOL1/$TESTFS1
+ log_must zfs mount $TESTPOOL1/$TESTFS1
+ verify_pool $TESTPOOL1
+}
+
+DEVSIZE="150M"
+specials_list=""
+i=0
+while [[ $i != 10 ]]; do
+ log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
+ specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+ ((i = i + 1))
+done
+
+slow_disk=$TESTDIR/$TESTFILE1.3
+log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE
+
+# Test file size in MB
+count=200
+
+for type in "raidz2" "raidz3" "draid2"; do
+
+ create_pool $TESTPOOL1 $type $specials_list
+ log_must zfs create -o primarycache=none -o recordsize=512K \
+ $TESTPOOL1/$TESTFS1
+ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+ log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count
+
+ # Make one disk 100ms slower to trigger a sit out
+ log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1
+
+ # Do some reads and wait for sit out on slow disk
+ SECONDS=0
+ typeset -i size=0
+ for i in $(seq 1 $count) ; do
+ dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
+ size=$i
+
+ sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
+ if [[ "$sit_out" == "on" ]] ; then
+ break
+ fi
+ done
+ log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
+ log_note took $SECONDS seconds to reach sit out reading ${size}M
+ log_must zpool status -s $TESTPOOL1
+
+ replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+ log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""
+
+ destroy_pool $TESTPOOL1
+ log_must rm -rf /$TESTPOOL1
+done
+
+log_pass