From cff27cea27462a44226213ca6395bab2120dae22 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Sat, 4 Jan 2025 05:50:59 +0000 Subject: [PATCH] Latest set of feedback changes (from Brian) Signed-off-by: Don Brady --- include/sys/vdev_raidz_impl.h | 2 + man/man4/zfs.4 | 14 ++-- module/zfs/vdev_draid.c | 42 ++++++----- module/zfs/vdev_raidz.c | 131 +++++++++++++++++----------------- 4 files changed, 99 insertions(+), 90 deletions(-) diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 45cb5864a22b..8b992e95dbb6 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -118,6 +118,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ uint64_t rc_shadow_offset; /* for double write during expansion */ @@ -132,6 +133,7 @@ typedef struct raidz_row { int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ + int rr_noutliers; /* Count of latency outlier devices */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index bd7552e59121..214c3e1c8baa 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -501,11 +501,15 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value. .It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong For expanded RAID-Z, aggregate reads that have more rows than this. . -.It Sy raid_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong -For RAID-Z and dRAID only, this is the slow disk sit out time period in -seconds. -When a slow disk outlier is detected, then it gets placed in a sit out -during reads for the duration of this time period. +.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong +When a slow disk outlier is detected it is placed in a sit out state. +While sitting out the disk will not participate in normal reads, instead its +data will be reconstructed as needed from parity. +Resilver and scrub operations will always read from a disk, even if it's +sitting out. +Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time. +Writes will still be issued to a disk which is sitting out to maintain full +redundancy. Defaults to 600 seconds and a value of zero disables slow disk outlier detection. . diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index aae8acced89a..20b102f697d2 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1889,17 +1889,6 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) /* Sequential rebuild must do IO at redundancy group boundary. */ IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); - /* - * Calculate how much parity is available for sitting out reads - */ - int parity_avail = rr->rr_firstdatacol; - for (int p = 0; p < rr->rr_firstdatacol; p++) { - raidz_col_t *rc = &rr->rr_col[p]; - if (!vdev_draid_readable(vd->vdev_child[rc->rc_devidx], - rc->rc_offset)) { - parity_avail--; - } - } /* * Iterate over the columns in reverse order so that we hit the parity * last. Any errors along the way will force us to read the parity. @@ -2004,14 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_force_repair = 1; rc->rc_allow_repair = 1; } - } else if (parity_avail > 0 && c >= rr->rr_firstdatacol && - rr->rr_missingdata == 0 && - vdev_skip_latency_outlier(cvd, zio->io_flags)) { - rr->rr_missingdata++; - rc->rc_error = SET_ERROR(EAGAIN); - rc->rc_skipped = 1; - parity_avail--; - continue; + } else if (vdev_skip_latency_outlier(cvd, zio->io_flags)) { + rr->rr_noutliers++; + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) > 0) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b905108e575c..b277a4380fa7 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -357,7 +357,7 @@ uint_t raidz_expand_pause_point = 0; /* * This represents the duration for a slow drive read sit out. */ -static unsigned long raid_read_sit_out_secs = 600; +static unsigned long raidz_read_sit_out_secs = 600; static hrtime_t raid_outlier_check_interval_ms = 20; @@ -2293,12 +2293,13 @@ vdev_raidz_min_asize(vdev_t *vd) * * In vdev_child_slow_outlier() it looks for outliers based on disk * latency from the most recent child reads. Here we're checking if, - * over time, a disk has has been an outlier too many times. + * over time, a disk has has been an outlier too many times and is + * now in a sit out period. */ boolean_t vdev_skip_latency_outlier(vdev_t *vd, zio_flag_t io_flags) { - if (raid_read_sit_out_secs == 0) + if (raidz_read_sit_out_secs == 0) return (B_FALSE); /* Avoid skipping a data column read when resilvering */ @@ -2482,18 +2483,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; - /* - * Calculate how much parity is available for sitting out reads - */ - int parity_avail = rr->rr_firstdatacol; - for (int p = 0; p < rr->rr_firstdatacol; p++) { - raidz_col_t *rc = &rr->rr_col[p]; - if (rc->rc_size > 0 && - !vdev_readable(vd->vdev_child[rc->rc_devidx])) { - parity_avail--; - } - } - /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. @@ -2513,19 +2502,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } - /* - * Check if a data colummn read should be skipped - */ - if (parity_avail > 0 && - c >= rr->rr_firstdatacol && - rr->rr_missingdata == 0 && - vdev_skip_latency_outlier(cvd, zio->io_flags)) { - rr->rr_missingdata++; - rc->rc_error = SET_ERROR(EAGAIN); - rc->rc_skipped = 1; - parity_avail--; - continue; - } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; @@ -2535,6 +2511,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_skip_latency_outlier(cvd, zio->io_flags)) { + rr->rr_noutliers++; + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) > 0) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2570,12 +2580,6 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) prc->rc_skipped = 1; continue; } - /* XXX is this a good place to be skipping reads? */ - if (vdev_skip_latency_outlier(cvd, zio->io_flags)) { - prc->rc_error = SET_ERROR(EAGAIN); - prc->rc_skipped = 1; - continue; - } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, prc->rc_offset, prc->rc_abd, prc->rc_size, zio->io_type, zio->io_priority, 0, @@ -2856,7 +2860,7 @@ vdev_check_outlier_state(vdev_t *vd) * Begin a sit out period for this slow drive */ vd->vdev_read_sit_out_expire = gethrtime() + - SEC2NSEC(raid_read_sit_out_secs); + SEC2NSEC(raidz_read_sit_out_secs); /* count each slow io period */ mutex_enter(&vd->vdev_stat_lock); @@ -2865,8 +2869,8 @@ vdev_check_outlier_state(vdev_t *vd) (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, vd->vdev_spa, vd, NULL, NULL, 0); - zfs_dbgmsg("vd-%d begin read sit out for %d secs", - (int)vd->vdev_id, (int)raid_read_sit_out_secs); + vdev_dbgmsg(vd, "begin read sit out for %d secs", + (int)raidz_read_sit_out_secs); vdev_t *raidvd = vd->vdev_parent; for (int c = 0; c < raidvd->vdev_children; c++) { @@ -2919,22 +2923,18 @@ latency_quartiles_fence(const uint64_t *data, size_t n) return (fence); } -static void -latency_sort(uint64_t *samples, int size) { - /* Insertion sort but size is small */ - for (int i = 1; i < size; i++) { - uint64_t val = samples[i]; - int j = i; - while (j > 0 && samples[j - 1] > val) { - samples[j] = samples[j - 1]; - j--; - } - samples[j] = val; - } -} +#define LAT_SAMPLES_STACK 32 +#define LAT_SAMPLES_MIN 5 +#define LAT_OUTLIER_LIMIT 50 -#define STACK_SAMPLES 32 -#define MIN_LAT_SAMPLES 5 +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} /* * Check for any latency outlier from latest set of child reads. @@ -2944,11 +2944,11 @@ latency_sort(uint64_t *samples, int size) { * third quartile plus two times the Interquartile Range (IQR). This range * is the distance between the first and third quartile. */ -noinline static void +static void vdev_child_slow_outlier(zio_t *zio) { vdev_t *vd = zio->io_vd; - if (raid_read_sit_out_secs == 0 || vd->vdev_children < MIN_LAT_SAMPLES) + if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN) return; spa_t *spa = zio->io_spa; @@ -2968,10 +2968,10 @@ vdev_child_slow_outlier(zio_t *zio) } int samples = vd->vdev_children; - uint64_t data[STACK_SAMPLES]; + uint64_t data[LAT_SAMPLES_STACK]; uint64_t *lat_data; - if (samples > STACK_SAMPLES) + if (samples > LAT_SAMPLES_STACK) lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP); else lat_data = &data[0]; @@ -3007,22 +3007,22 @@ vdev_child_slow_outlier(zio_t *zio) } } - latency_sort(lat_data, samples); + qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare); uint64_t fence = latency_quartiles_fence(lat_data, samples); if (lat_data[samples - 1] > fence) { /* * Keep track of how many times this child has had * an outlier read. A disk that persitently has a - * higer than peers outlier count will be considered + * higher than peers outlier count will be considered * a slow disk. */ - if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > 50 && - svd == ovd) { + if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > + LAT_OUTLIER_LIMIT && svd == ovd) { vdev_check_outlier_state(svd); } } out: - if (samples > STACK_SAMPLES) + if (samples > LAT_SAMPLES_STACK) kmem_free(lat_data, sizeof (uint64_t) * samples); } @@ -3766,11 +3766,10 @@ vdev_raidz_io_done(zio_t *zio) for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); - - /* Periodically check for a read outlier */ - if (i == 0 && zio->io_type == ZIO_TYPE_READ) - vdev_child_slow_outlier(zio); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -5406,6 +5405,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); -ZFS_MODULE_PARAM(zfs_vdev, raid_, read_sit_out_secs, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, raidz_, read_sit_out_secs, ULONG, ZMOD_RW, "Raidz/draid slow disk sit out time period in seconds"); /* END CSTYLED */