diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index dc474e3739f3..102ffef016c5 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -379,6 +379,7 @@ typedef enum {
	VDEV_PROP_TRIM_SUPPORT,
	VDEV_PROP_TRIM_ERRORS,
	VDEV_PROP_SLOW_IOS,
+	VDEV_PROP_SIT_OUT,
	VDEV_NUM_PROPS
 } vdev_prop_t;

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..a9310f16fffb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -285,6 +285,7 @@ struct vdev {
	boolean_t	vdev_ishole;	/* is a hole in the namespace	*/
	uint64_t	vdev_top_zap;
	vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias	*/
+	uint64_t	vdev_last_latency_check; /* time of last outlier check */

	/* pool checkpoint related */
	space_map_t	*vdev_checkpoint_sm;	/* contains reserved blocks */
@@ -432,6 +433,9 @@ struct vdev {
	hrtime_t	vdev_mmp_pending;	/* 0 if write finished	*/
	uint64_t	vdev_mmp_kstat_id;	/* to find kstat entry */
	uint64_t	vdev_expansion_time;	/* vdev's last expansion time */
+	uint64_t	vdev_outlier_count;	/* read outlier amongst peers */
+	uint64_t	vdev_ewma_latency;	/* moving average read latency */
+	hrtime_t	vdev_read_sit_out_expire; /* end of sit out period */
	list_node_t	vdev_leaf_node;		/* leaf vdev list */

	/*

diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 64f484e9aa13..25f1bee72a27 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
 struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
 void vdev_raidz_reflow_copy_scratch(spa_t *);
 void raidz_dtl_reassessed(vdev_t *);
+boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);

 extern const zio_vsd_ops_t vdev_raidz_vsd_ops;

diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 45cb5864a22b..07f8e560c747 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -118,6 +118,7 @@ typedef struct raidz_col {
	uint8_t rc_need_orig_restore:1;	/* need to restore from orig_data? */
	uint8_t rc_force_repair:1;	/* Write good data to this column */
	uint8_t rc_allow_repair:1;	/* Allow repair I/O to this column */
+	uint8_t rc_latency_outlier:1;	/* Latency outlier for this device */
	int rc_shadow_devidx;		/* for double write during expansion */
	int rc_shadow_error;		/* for double write during expansion */
	uint64_t rc_shadow_offset;	/* for double write during expansion */
@@ -132,6 +133,7 @@ typedef struct raidz_row {
	int rr_firstdatacol;		/* First data column/parity count */
	abd_t *rr_abd_empty;		/* dRAID empty sector buffer */
	int rr_nempty;			/* empty sectors included in parity */
+	int rr_outlier_cnt;		/* Count of latency outlier devices */
 #ifdef ZFS_DEBUG
	uint64_t rr_offset;		/* Logical offset for *_io_verify() */
	uint64_t rr_size;		/* Physical size for *_io_verify() */

diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 1f9fde6677d8..4f5dd1c983fc 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5917,7 +5917,8 @@
-    <enumerator name='VDEV_NUM_PROPS' value='…'/>
+    <enumerator name='VDEV_PROP_SIT_OUT' value='…'/>
+    <enumerator name='VDEV_NUM_PROPS' value='…'/>

diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 64f9d1f6eb49..dc0f0c53730c 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
		/* Only use if provided by the RAIDZ VDEV above */
		if (prop == VDEV_PROP_RAIDZ_EXPANDING)
			return (ENOENT);
+		if (prop == VDEV_PROP_SIT_OUT)
+			return (ENOENT);
	}
	if (vdev_prop_index_to_string(prop, intval,
	    (const char **)&strval) != 0)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..9225996d2525 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -591,6 +591,18 @@ new format when enabling the feature.
 The default is to convert all log entries.
 .
+.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
+When a slow disk outlier is detected, it is placed in a sit out state.
+While sitting out, the disk will not participate in normal reads; instead,
+its data will be reconstructed as needed from parity.
+Resilver and scrub operations will always read from a disk, even if it is
+sitting out.
+Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
+Writes will still be issued to a disk which is sitting out to maintain full
+redundancy.
+Defaults to 600 seconds; a value of zero disables slow disk outlier
+detection.
+.
 .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
 During top-level vdev removal, chunks of data are copied from the vdev
 which may include free space in order to trade bandwidth for IOPS.
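The gate this tunable controls reduces to a small predicate. Below is a minimal userspace sketch mirroring the vdev_sit_out_reads() added in module/zfs/vdev_raidz.c later in this diff; the simplified vdev struct, the bool scrub flag, and the use of time(NULL) in place of the kernel's gethrestime_sec() are stand-ins for illustration only.

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/* Simplified stand-in for the ZFS vdev_t fields used by the patch. */
typedef struct vdev {
	uint64_t vdev_read_sit_out_expire;	/* wall-clock seconds */
} vdev_t;

static uint64_t vdev_read_sit_out_secs = 600;	/* 0 disables the feature */

/* Skip normal reads while the sit out period has not yet expired. */
static bool
sit_out_reads(const vdev_t *vd, bool is_scrub)
{
	if (vdev_read_sit_out_secs == 0)
		return (false);

	/* Scrub and resilver reads always go to the disk. */
	if (is_scrub)
		return (false);

	return (vd->vdev_read_sit_out_expire >= (uint64_t)time(NULL));
}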
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 34d4026b1009..229715c35d92 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -104,12 +104,23 @@ Comma separated list of children of this vdev
 The number of children belonging to this vdev
 .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
 The number of errors of each type encountered by this vdev
+.It Sy sit_out
+True when a slow disk outlier was detected and the vdev is currently in a sit
+out state.
+While sitting out, the vdev will not participate in normal reads; instead,
+its data will be reconstructed as needed from parity.
 .It Sy slow_ios
-The number of slow I/Os encountered by this vdev,
-These represent I/O operations that didn't complete in
+This indicates the number of slow I/O operations encountered by this vdev.
+A slow I/O is defined as an operation that did not complete within the
 .Sy zio_slow_io_ms
-milliseconds
+threshold in milliseconds
 .Pq Sy 30000 No by default .
+For
+.Sy RAIDZ
+and
+.Sy DRAID
+configurations, this value also represents the number of times the vdev was
+identified as an outlier and excluded from participating in read I/O operations.
 .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
 The number of I/O operations of each type performed by this vdev
 .It Xo

diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index ea9eda4b316d..ef932ec4a0f6 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -466,6 +466,9 @@ vdev_prop_init(void)
	zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
	    boolean_table, sfeatures);
+	zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
+	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
+	    sfeatures);
	zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
	    boolean_table, sfeatures);

diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..f03200cdf86c 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
	vd->vdev_stat.vs_checksum_errors = 0;
	vd->vdev_stat.vs_dio_verify_errors = 0;
	vd->vdev_stat.vs_slow_ios = 0;
+	atomic_store_64(&vd->vdev_outlier_count, 0);
+	vd->vdev_read_sit_out_expire = 0;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);
@@ -6361,6 +6363,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
					    ZPROP_SRC_NONE);
				}
				continue;
+			case VDEV_PROP_SIT_OUT:
+				/* Only expose this for a draid or raidz leaf */
+				if (vd->vdev_ops->vdev_op_leaf &&
+				    vd->vdev_top != NULL &&
+				    (vd->vdev_top->vdev_ops ==
+				    &vdev_raidz_ops ||
+				    vd->vdev_top->vdev_ops ==
+				    &vdev_draid_ops)) {
+					vdev_prop_add_list(outnvl, propname,
+					    NULL, vdev_sit_out_reads(vd, 0),
+					    ZPROP_SRC_NONE);
+				}
+				continue;
			case VDEV_PROP_TRIM_SUPPORT:
				/* only valid for leaf vdevs */
				if (vd->vdev_ops->vdev_op_leaf) {

diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 419c8ac5bb28..326dfcabfb81 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1993,6 +1993,31 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
				rc->rc_force_repair = 1;
				rc->rc_allow_repair = 1;
			}
+		} else if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+			rr->rr_outlier_cnt++;
+			ASSERT0(rc->rc_latency_outlier);
+			rc->rc_latency_outlier = 1;
+		}
+	}
+
+	/*
+	 * When the row contains a latency outlier and sufficient parity
+	 * exists to reconstruct the column data, then skip reading the
+	 * known slow child vdev as a performance optimization.
+	 */
+	if (rr->rr_outlier_cnt > 0 &&
+	    (rr->rr_firstdatacol - rr->rr_missingparity) >=
+	    (rr->rr_missingdata + 1)) {
+
+		for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+			raidz_col_t *rc = &rr->rr_col[c];
+
+			if (rc->rc_error == 0 && rc->rc_latency_outlier) {
+				rr->rr_missingdata++;
+				rc->rc_error = SET_ERROR(EAGAIN);
+				rc->rc_skipped = 1;
+				break;
+			}
+		}
	}
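The guard above is plain arithmetic on the row's parity budget: rr_firstdatacol doubles as the parity column count, so a row may skip one outlier only if the surviving parity covers every already-missing column plus the one being skipped. A standalone illustration (the helper name and sample values are hypothetical):

#include <stdbool.h>
#include <stdio.h>

/*
 * A row may sit out one latency outlier only if the remaining parity
 * still covers every already-missing column plus the skipped one.
 */
static bool
can_skip_outlier(int firstdatacol, int missingparity, int missingdata)
{
	return ((firstdatacol - missingparity) >= (missingdata + 1));
}

int
main(void)
{
	/* raidz2 (2 parity columns), everything healthy */
	printf("%d\n", can_skip_outlier(2, 0, 0));	/* 1: can sit out */
	/* raidz2 with one parity column already unreadable */
	printf("%d\n", can_skip_outlier(2, 1, 0));	/* 1: one to spare */
	/* ...and one data column already missing */
	printf("%d\n", can_skip_outlier(2, 1, 1));	/* 0: must read all */
	return (0);
}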
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 6103f780e6bc..6d8385856998 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -354,6 +354,13 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
  */
 uint_t raidz_expand_pause_point = 0;

+/*
+ * How long, in seconds, a detected slow drive will sit out reads.
+ */
+static unsigned long vdev_read_sit_out_secs = 600;
+
+/* Minimum interval, in milliseconds, between latency outlier checks. */
+static hrtime_t raid_outlier_check_interval_ms = 20;
+
 /*
  * Maximum amount of copy io's outstanding at once.
  */
@@ -2281,6 +2288,45 @@ vdev_raidz_min_asize(vdev_t *vd)
	    vd->vdev_children);
 }

+/*
+ * Return B_TRUE if a read should be skipped due to being too slow.
+ *
+ * In vdev_child_slow_outlier() we look for outliers based on disk
+ * latency from the most recent child reads. Here we're checking if,
+ * over time, a disk has been an outlier too many times and is
+ * now in a sit out period.
+ */
+boolean_t
+vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
+{
+	if (vdev_read_sit_out_secs == 0)
+		return (B_FALSE);
+
+	/* Avoid skipping a data column read when scrubbing */
+	if (io_flags & ZIO_FLAG_SCRUB)
+		return (B_FALSE);
+
+	return (vd->vdev_read_sit_out_expire >= gethrestime_sec());
+}
+
+/*
+ * Calculate the Exponentially Weighted Moving Average (EWMA),
+ * where
+ *	alpha: the smoothing factor -- represented here as a scaled integer
+ *	scale: the number of bits used to scale alpha
+ */
+static uint64_t
+calculate_ewma(uint64_t previous_ewma, uint64_t latest_value)
+{
+	/*
+	 * Scale using 8 bits with an effective alpha of 0.25.
+	 */
+	const uint64_t scale = 8;
+	const uint64_t alpha = 64;
+
+	return (((alpha * latest_value) + (((1ULL << scale) - alpha) *
+	    previous_ewma)) >> scale);
+}
+
 void
 vdev_raidz_child_done(zio_t *zio)
 {
@@ -2290,6 +2336,23 @@ vdev_raidz_child_done(zio_t *zio)
	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
+
+	/*
+	 * Process the disk I/O latency before it goes out of scope.
+	 *
+	 * A zio->io_delay value of zero means this I/O was part of
+	 * an aggregation.
+	 */
+	if (vdev_read_sit_out_secs != 0 && zio->io_type == ZIO_TYPE_READ &&
+	    zio->io_error == 0 && zio->io_size > 0 && zio->io_delay != 0) {
+		vdev_t *vd = zio->io_vd;
+		uint64_t previous_ewma = atomic_load_64(&vd->vdev_ewma_latency);
+		if (previous_ewma == 0)
+			previous_ewma = zio->io_delay;
+
+		atomic_store_64(&vd->vdev_ewma_latency,
+		    calculate_ewma(previous_ewma, zio->io_delay));
+	}
 }
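To see how quickly the fixed-point EWMA above (alpha = 64/256 = 0.25) tracks a degrading disk, here is a small userspace rendering of the same arithmetic; the latency samples are made up for the example.

#include <stdint.h>
#include <stdio.h>

/* Same fixed-point EWMA as the patch: alpha = 64/256 = 0.25. */
static uint64_t
calculate_ewma(uint64_t previous_ewma, uint64_t latest_value)
{
	const uint64_t scale = 8;
	const uint64_t alpha = 64;

	return (((alpha * latest_value) + (((1ULL << scale) - alpha) *
	    previous_ewma)) >> scale);
}

int
main(void)
{
	/* Hypothetical read latencies in nanoseconds: the disk degrades. */
	uint64_t lat[] = { 1000000, 1100000, 9000000, 9500000, 9800000 };
	uint64_t ewma = 0;

	for (int i = 0; i < 5; i++) {
		/* First sample seeds the average, as in the patch. */
		ewma = (ewma == 0) ? lat[i] : calculate_ewma(ewma, lat[i]);
		printf("sample %d: latency=%llu ewma=%llu\n", i,
		    (unsigned long long)lat[i], (unsigned long long)ewma);
	}
	return (0);
}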
@@ -2445,6 +2508,42 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
			rc->rc_skipped = 1;
			continue;
		}
+
+		if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+			rr->rr_outlier_cnt++;
+			ASSERT0(rc->rc_latency_outlier);
+			rc->rc_latency_outlier = 1;
+		}
+	}
+
+	/*
+	 * When the row contains a latency outlier and sufficient parity
+	 * exists to reconstruct the column data, then skip reading the
+	 * known slow child vdev as a performance optimization.
+	 */
+	if (rr->rr_outlier_cnt > 0 &&
+	    (rr->rr_firstdatacol - rr->rr_missingparity) >=
+	    (rr->rr_missingdata + 1)) {
+
+		for (int c = rr->rr_cols - 1; c >= 0; c--) {
+			raidz_col_t *rc = &rr->rr_col[c];
+
+			if (rc->rc_error == 0 && rc->rc_latency_outlier) {
+				rr->rr_missingdata++;
+				rc->rc_error = SET_ERROR(EAGAIN);
+				rc->rc_skipped = 1;
+				break;
+			}
+		}
+	}
+
+	for (int c = rr->rr_cols - 1; c >= 0; c--) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+		if (rc->rc_error || rc->rc_size == 0)
+			continue;
+
		if (forceparity ||
		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
@@ -2468,6 +2567,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
		ASSERT3U(prc->rc_devidx, ==, i);
		vdev_t *cvd = vd->vdev_child[i];
+
		if (!vdev_readable(cvd)) {
			prc->rc_error = SET_ERROR(ENXIO);
			prc->rc_tried = 1;	/* don't even try */
@@ -2744,6 +2844,161 @@ vdev_raidz_worst_error(raidz_row_t *rr)
	return (error);
 }

+/*
+ * Find the median value from a sorted set of n values.
+ */
+static uint64_t
+latency_median_value(const uint64_t *data, size_t n)
+{
+	uint64_t m;
+
+	if (n % 2 == 0)
+		m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1;
+	else
+		m = data[((n + 1) >> 1) - 1];
+
+	return (m);
+}
+
+/*
+ * Calculate the outlier fence from a sorted set of n latency values.
+ *
+ *	fence = Q3 + 2 x (Q3 - Q1)
+ */
+static uint64_t
+latency_quartiles_fence(const uint64_t *data, size_t n)
+{
+	uint64_t q1, q3;
+
+	q1 = latency_median_value(&data[0], n >> 1);
+	if (n % 2 == 0)
+		q3 = latency_median_value(&data[n >> 1], n >> 1);
+	else
+		q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
+
+	/*
+	 * To avoid detecting false positive outliers when N is small
+	 * and the latency values are very close, make sure the fence
+	 * is at least 25% larger than Q1.
+	 */
+	uint64_t iqr = MAX(q3 - q1, q1 >> 3);
+
+	return (q3 + (iqr << 1));
+}
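A userspace rendering of the quartile math above, run against a hypothetical sorted latency set for a 10-disk vdev, shows how the Q1 floor keeps tightly clustered samples from tripping the fence while a genuinely slow disk still does:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Median of a sorted array of n values, as in the patch. */
static uint64_t
latency_median_value(const uint64_t *data, size_t n)
{
	if (n % 2 == 0)
		return ((data[(n >> 1) - 1] + data[n >> 1]) >> 1);
	return (data[((n + 1) >> 1) - 1]);
}

/* fence = Q3 + 2 x IQR, with the IQR floored at Q1/8. */
static uint64_t
latency_quartiles_fence(const uint64_t *data, size_t n)
{
	uint64_t q1 = latency_median_value(&data[0], n >> 1);
	uint64_t q3 = (n % 2 == 0) ?
	    latency_median_value(&data[n >> 1], n >> 1) :
	    latency_median_value(&data[(n + 1) >> 1], n >> 1);
	uint64_t iqr = (q3 - q1 > (q1 >> 3)) ? (q3 - q1) : (q1 >> 3);

	return (q3 + (iqr << 1));
}

int
main(void)
{
	/* Hypothetical sorted EWMA latencies (us): nine close, one slow. */
	uint64_t lat[10] = { 900, 950, 980, 1000, 1010, 1020, 1040,
	    1060, 1100, 4000 };

	uint64_t fence = latency_quartiles_fence(lat, 10);
	/* Q1=980, Q3=1060, IQR floored to 122: fence=1304, 4000 is out. */
	printf("fence=%llu slowest=%llu outlier=%s\n",
	    (unsigned long long)fence, (unsigned long long)lat[9],
	    lat[9] > fence ? "yes" : "no");
	return (0);
}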
+
+#define	LAT_SAMPLES_STACK	64
+#define	LAT_SAMPLES_MIN		5
+#define	LAT_OUTLIER_LIMIT	50
+
+static int
+latency_compare(const void *arg1, const void *arg2)
+{
+	const uint64_t *l1 = (uint64_t *)arg1;
+	const uint64_t *l2 = (uint64_t *)arg2;
+
+	return (TREE_CMP(*l1, *l2));
+}
+
+/*
+ * Check for any latency outlier from the latest set of child reads.
+ *
+ * Uses a Tukey's fence, with K = 2, for detecting extreme outliers. This
+ * rule defines extreme outliers as data points outside the fence of the
+ * third quartile plus two times the Interquartile Range (IQR). This range
+ * is the distance between the first and third quartile.
+ */
+static void
+vdev_child_slow_outlier(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	if (vdev_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
+		return;
+
+	hrtime_t now = gethrtime();
+	uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
+
+	if ((now - last) < MSEC2NSEC(raid_outlier_check_interval_ms) ||
+	    atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) {
+		return;
+	}
+
+	int samples = vd->vdev_children;
+	uint64_t data[LAT_SAMPLES_STACK];
+	uint64_t *lat_data;
+
+	if (samples > LAT_SAMPLES_STACK)
+		lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP);
+	else
+		lat_data = &data[0];
+
+	uint64_t max = 0;
+	vdev_t *svd = NULL;	/* suspect vdev */
+	for (int c = 0; c < samples; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (cvd->vdev_read_sit_out_expire != 0) {
+			if (cvd->vdev_read_sit_out_expire < gethrestime_sec()) {
+				/*
+				 * Done with our sit out, wait for a new
+				 * outlier to emerge.
+				 */
+				cvd->vdev_read_sit_out_expire = 0;
+			} else {
+				atomic_store_64(&cvd->vdev_ewma_latency, 0);
+				/* Only one sit out disk at a time for now */
+				goto out;
+			}
+		}
+
+		lat_data[c] = atomic_load_64(&cvd->vdev_ewma_latency);
+
+		/* Wait until all disks have been read from */
+		if (lat_data[c] == 0)
+			goto out;
+
+		/* Keep track of the vdev with the largest value */
+		if (lat_data[c] > max) {
+			max = lat_data[c];
+			svd = cvd;
+		}
+	}
+
+	qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
+	uint64_t fence = latency_quartiles_fence(lat_data, samples);
+	if (lat_data[samples - 1] > fence) {
+		/*
+		 * Keep track of how many times this child has had
+		 * an outlier read. A disk that persistently has a
+		 * higher outlier count than its peers will be
+		 * considered a slow disk.
+		 */
+		if (++svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
+			ASSERT0(svd->vdev_read_sit_out_expire);
+			/*
+			 * Begin a sit out period for this slow drive.
+			 */
+			svd->vdev_read_sit_out_expire = gethrestime_sec() +
+			    vdev_read_sit_out_secs;
+
+			/* Count each slow I/O period */
+			mutex_enter(&svd->vdev_stat_lock);
+			svd->vdev_stat.vs_slow_ios++;
+			mutex_exit(&svd->vdev_stat_lock);
+
+			(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+			    zio->io_spa, svd, NULL, NULL, 0);
+			vdev_dbgmsg(svd, "begin read sit out for %d secs",
+			    (int)vdev_read_sit_out_secs);
+
+			for (int c = 0; c < vd->vdev_children; c++)
+				vd->vdev_child[c]->vdev_outlier_count = 0;
+		}
+	}
+out:
+	if (samples > LAT_SAMPLES_STACK)
+		kmem_free(lat_data, sizeof (uint64_t) * samples);
+}
+
 static void
 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 {
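Note that vdev_child_slow_outlier() rate-limits itself without taking a lock: only the thread that wins the compare-and-swap on vdev_last_latency_check runs the scan for a given interval. A minimal sketch of that pattern using C11 atomics (the function name is illustrative, not part of the patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Once-per-interval gate: returns true for at most one caller per
 * 'interval' of time, no matter how many threads race through it.
 */
static bool
outlier_check_due(_Atomic uint64_t *last_check, uint64_t now,
    uint64_t interval)
{
	uint64_t last = atomic_load(last_check);

	if (now - last < interval)
		return (false);

	/* Whoever swaps in the new timestamp wins; everyone else skips. */
	return (atomic_compare_exchange_strong(last_check, &last, now));
}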
@@ -2813,7 +3068,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
			    "offset=%llx", zio, c, rc->rc_devidx,
			    (long long)rc->rc_offset);
-
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE,
@@ -3485,6 +3739,9 @@ vdev_raidz_io_done(zio_t *zio)
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_done_verified(zio, rr);
		}
+		/* Periodically check for a read outlier */
+		if (zio->io_type == ZIO_TYPE_READ)
+			vdev_child_slow_outlier(zio);
		zio_checksum_verified(zio);
	} else {
		/*
@@ -5120,3 +5377,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
+	"Raidz/draid slow disk sit out time period in seconds");

diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 8a4a4b0f5cb8..1ed9478c40e6 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -702,6 +702,10 @@ tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
     'dio_unaligned_block', 'dio_unaligned_filesize']
 tags = ['functional', 'direct']

+[tests/functional/events]
+tests = ['slow_vdev_sit_out']
+tags = ['functional', 'events']
+
 [tests/functional/exec]
 tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
@@ -900,8 +904,8 @@ tags = ['functional', 'rename_dirs']
 tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
     'attach_resilver', 'detach', 'rebuild_disabled_feature',
     'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
-    'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
-    'scrub_cancel']
+    'replace_resilver', 'replace_resilver_sit_out', 'resilver_restart_001',
+    'resilver_restart_002', 'scrub_cancel']
 tags = ['functional', 'replacement']

 [tests/functional/reservation]

diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 9cf919c3dd0f..9d77ea04aa3a 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -1109,6 +1109,16 @@ function get_pool_prop # property pool
	zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
 }

+# Get the specified vdev property in parsable format or fail
+function get_vdev_prop
+{
+	typeset prop="$1"
+	typeset pool="$2"
+	typeset vdev="$3"
+
+	zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev"
+}
+
 # Return 0 if a pool exists; $? otherwise
 #
 # $1 - pool name
@@ -1967,6 +1978,28 @@ function wait_vdev_state # pool disk state timeout
	return 1
 }

+#
+# Wait for the vdev 'sit_out' property to be cleared.
+#
+# $1 pool name
+# $2 vdev name
+# $3 timeout
+#
+function wait_sit_out #pool vdev timeout
+{
+	typeset pool=${1:-$TESTPOOL}
+	typeset vdev="$2"
+	typeset timeout=${3:-300}
+
+	for (( timer = 0; timer < $timeout; timer++ )); do
+		if [ "$(get_vdev_prop sit_out "$pool" "$vdev")" = "off" ]; then
+			return 0
+		fi
+		sleep 1
+	done
+
+	return 1
+}
+
 #
 # Check the output of 'zpool status -v <pool>',
 # and to see if the content of <token> contain the <keyword> specified.

diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 2024c44cc138..9d3b9c4f2d65 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -69,6 +69,7 @@ MULTIHOST_INTERVAL		multihost.interval		zfs_multihost_interval
 OVERRIDE_ESTIMATE_RECORDSIZE	send.override_estimate_recordsize	zfs_override_estimate_recordsize
 PREFETCH_DISABLE		prefetch.disable		zfs_prefetch_disable
 RAIDZ_EXPAND_MAX_REFLOW_BYTES	vdev.expand_max_reflow_bytes	raidz_expand_max_reflow_bytes
+READ_SIT_OUT_SECS		vdev.read_sit_out_secs		vdev_read_sit_out_secs
 REBUILD_SCRUB_ENABLED		rebuild_scrub_enabled		zfs_rebuild_scrub_enabled
 REMOVAL_SUSPEND_PROGRESS	removal_suspend_progress	zfs_removal_suspend_progress
 REMOVE_MAX_SEGMENT		remove_max_segment		zfs_remove_max_segment

diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index df183825dc68..42e7bce856d9 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1497,6 +1497,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
	functional/events/events_001_pos.ksh \
	functional/events/events_002_pos.ksh \
	functional/events/setup.ksh \
+	functional/events/slow_vdev_sit_out.ksh \
	functional/events/zed_cksum_config.ksh \
	functional/events/zed_cksum_reported.ksh \
	functional/events/zed_diagnose_multiple.ksh \
@@ -1884,6 +1885,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
	functional/replacement/replace_import.ksh \
	functional/replacement/replace_rebuild.ksh \
	functional/replacement/replace_resilver.ksh \
+	functional/replacement/replace_resilver_sit_out.ksh \
	functional/replacement/resilver_restart_001.ksh \
	functional/replacement/resilver_restart_002.ksh \
	functional/replacement/scrub_cancel.ksh \
diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
new file mode 100755
index 000000000000..64be51c70d5c
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
@@ -0,0 +1,91 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+
+# DESCRIPTION:
+#	Verify that vdevs 'sit out' when they are slow
+#
+# STRATEGY:
+#	1. Create various raidz/draid pools
+#	2. Inject delays into one of the disks
+#	3. Verify the disk is set to 'sit out' for a while.
+#	4. Wait for READ_SIT_OUT_SECS and verify the sit out state is lifted.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+function cleanup
+{
+	restore_tunable READ_SIT_OUT_SECS
+	log_must zinject -c all
+	destroy_pool $TESTPOOL2
+	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
+}
+
+log_assert "Verify sit_out works"
+
+log_onexit cleanup
+
+# Shorten the sit out period for testing
+save_tunable READ_SIT_OUT_SECS
+set_tunable32 READ_SIT_OUT_SECS 5
+
+log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}
+
+for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do
+	log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
+	log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=100
+	log_must zpool export $TESTPOOL2
+	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2
+
+	BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9
+
+	# Initial state should not be sitting out
+	log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]]
+
+	# Delay our reads 200ms to trigger sit out
+	log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2
+
+	# Do some reads and wait for us to sit out
+	for i in {1..100} ; do
+		dd if=/$TESTPOOL2/bigfile skip=$i bs=1M count=1 of=/dev/null
+
+		sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)
+		if [[ "$sit_out" == "on" ]] ; then
+			break
+		fi
+	done
+
+	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on"
+
+	# Clear fault injection
+	log_must zinject -c all
+
+	# Wait for us to exit our sit out period
+	log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10
+
+	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"
+	destroy_pool $TESTPOOL2
+done
+
+log_pass "sit_out works correctly"
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh
new file mode 100755
index 000000000000..7e2dfdae6783
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh
@@ -0,0 +1,184 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+#	Replacing disks while another disk is sitting out reads should pass.
+#
+# STRATEGY:
+#	1. Create raidz and draid pools
+#	2. Make one disk slower and trigger a read sit out for that disk
+#	3. Start some random I/O
+#	4. Replace a disk in the pool with another disk.
+#	5. Verify the integrity of the file system and the resilvering.
+#
+
+verify_runnable "global"
+
+save_tunable READ_SIT_OUT_SECS
+set_tunable32 READ_SIT_OUT_SECS 120
+
+function cleanup
+{
+	restore_tunable READ_SIT_OUT_SECS
+	log_must zinject -c all
+
+	if [[ -n "$child_pids" ]]; then
+		for wait_pid in $child_pids
+		do
+			kill $wait_pid
+		done
+	fi
+
+	if poolexists $TESTPOOL1; then
+		destroy_pool $TESTPOOL1
+	fi
+
+	[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O with a sit out completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function replace_test
+{
+	typeset -i iters=2
+	typeset disk1=$1
+	typeset disk2=$2
+
+	typeset i=0
+	while [[ $i -lt $iters ]]; do
+		log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i"
+		file_trunc $options $TESTDIR/$TESTFILE.$i &
+		typeset pid=$!
+
+		sleep 1
+
+		child_pids="$child_pids $pid"
+		((i = i + 1))
+	done
+
+	# Replace the disk while the slow drive is still sitting out
+	SECONDS=0
+	log_must zpool replace -w $TESTPOOL1 $disk1 $disk2
+	log_note "took $SECONDS seconds to replace disk"
+
+	for wait_pid in $child_pids
+	do
+		kill $wait_pid
+	done
+	child_pids=""
+
+	log_must zinject -c all
+	log_must zpool export $TESTPOOL1
+	log_must zpool import -d $TESTDIR $TESTPOOL1
+	log_must zfs umount $TESTPOOL1/$TESTFS1
+	log_must zdb -cdui $TESTPOOL1/$TESTFS1
+	log_must zfs mount $TESTPOOL1/$TESTFS1
+	verify_pool $TESTPOOL1
+}
+
+DEVSIZE="150M"
+specials_list=""
+i=0
+while [[ $i != 10 ]]; do
+	log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i
+	specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+	((i = i + 1))
+done
+
+slow_disk=$TESTDIR/$TESTFILE1.3
+log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE
+
+# Test file size in MB
+count=200
+
+for type in "raidz2" "raidz3" "draid2"; do
+
+	create_pool $TESTPOOL1 $type $specials_list
+	log_must zfs create -o primarycache=none -o recordsize=512K \
+	    $TESTPOOL1/$TESTFS1
+	log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+	log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count
+
+	# Make one disk 100ms slower to trigger a sit out
+	log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1
+
+	# Do some reads and wait for sit out on the slow disk
+	SECONDS=0
+	typeset -i size=0
+	for i in $(seq 1 $count) ; do
+		dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null
+		size=$i
+
+		sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)
+		if [[ "$sit_out" == "on" ]] ; then
+			break
+		fi
+	done
+	log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on"
+	log_note "took $SECONDS seconds to reach sit out reading ${size}M"
+	log_must zpool status -s $TESTPOOL1
+
+	replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+	log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\""
+
+	destroy_pool $TESTPOOL1
+	log_must rm -rf /$TESTPOOL1
+done
+
+log_pass