Skip to content

Commit

Permalink
Latest set of feedback changes (from Brian)
Browse files Browse the repository at this point in the history
Signed-off-by: Don Brady <[email protected]>
  • Loading branch information
don-brady committed Jan 4, 2025
1 parent 00e4044 commit cff27ce
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 90 deletions.
2 changes: 2 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
Expand All @@ -132,6 +133,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_noutliers; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
Expand Down
14 changes: 9 additions & 5 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -501,11 +501,15 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value.
.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong
For expanded RAID-Z, aggregate reads that have more rows than this.
.
.It Sy raid_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
For RAID-Z and dRAID only, this is the slow disk sit out time period in
seconds.
When a slow disk outlier is detected, then it gets placed in a sit out
during reads for the duration of this time period.
.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
When a slow disk outlier is detected it is placed in a sit out state.
While sitting out the disk will not participate in normal reads, instead its
data will be reconstructed as needed from parity.
Resilver and scrub operations will always read from a disk, even if it's
sitting out.
Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
Writes will still be issued to a disk which is sitting out to maintain full
redundancy.
Defaults to 600 seconds and a value of zero disables slow disk outlier
detection.
.
Expand Down
42 changes: 23 additions & 19 deletions module/zfs/vdev_draid.c
Original file line number Diff line number Diff line change
Expand Up @@ -1889,17 +1889,6 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
/* Sequential rebuild must do IO at redundancy group boundary. */
IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);

/*
* Calculate how much parity is available for sitting out reads
*/
int parity_avail = rr->rr_firstdatacol;
for (int p = 0; p < rr->rr_firstdatacol; p++) {
raidz_col_t *rc = &rr->rr_col[p];
if (!vdev_draid_readable(vd->vdev_child[rc->rc_devidx],
rc->rc_offset)) {
parity_avail--;
}
}
/*
* Iterate over the columns in reverse order so that we hit the parity
* last. Any errors along the way will force us to read the parity.
Expand Down Expand Up @@ -2004,14 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
}
} else if (parity_avail > 0 && c >= rr->rr_firstdatacol &&
rr->rr_missingdata == 0 &&
vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
parity_avail--;
continue;
} else if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_noutliers++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}

Expand Down
131 changes: 65 additions & 66 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ uint_t raidz_expand_pause_point = 0;
/*
* This represents the duration for a slow drive read sit out.
*/
static unsigned long raid_read_sit_out_secs = 600;
static unsigned long raidz_read_sit_out_secs = 600;

static hrtime_t raid_outlier_check_interval_ms = 20;

Expand Down Expand Up @@ -2293,12 +2293,13 @@ vdev_raidz_min_asize(vdev_t *vd)
*
* In vdev_child_slow_outlier() it looks for outliers based on disk
* latency from the most recent child reads. Here we're checking if,
* over time, a disk has has been an outlier too many times.
 * over time, a disk has been an outlier too many times and is
* now in a sit out period.
*/
boolean_t
vdev_skip_latency_outlier(vdev_t *vd, zio_flag_t io_flags)
{
if (raid_read_sit_out_secs == 0)
if (raidz_read_sit_out_secs == 0)
return (B_FALSE);

/* Avoid skipping a data column read when resilvering */
Expand Down Expand Up @@ -2482,18 +2483,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
vdev_t *vd = zio->io_vd;

/*
* Calculate how much parity is available for sitting out reads
*/
int parity_avail = rr->rr_firstdatacol;
for (int p = 0; p < rr->rr_firstdatacol; p++) {
raidz_col_t *rc = &rr->rr_col[p];
if (rc->rc_size > 0 &&
!vdev_readable(vd->vdev_child[rc->rc_devidx])) {
parity_avail--;
}
}

/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
Expand All @@ -2513,19 +2502,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
/*
* Check if a data colummn read should be skipped
*/
if (parity_avail > 0 &&
c >= rr->rr_firstdatacol &&
rr->rr_missingdata == 0 &&
vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
parity_avail--;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rr->rr_firstdatacol)
rr->rr_missingdata++;
Expand All @@ -2535,6 +2511,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}

if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
rr->rr_noutliers++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}

for (int c = rr->rr_cols - 1; c >= 0; c--) {
raidz_col_t *rc = &rr->rr_col[c];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

if (rc->rc_error || rc->rc_size == 0)
continue;

if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
Expand Down Expand Up @@ -2570,12 +2580,6 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
prc->rc_skipped = 1;
continue;
}
/* XXX is this a good place to be skipping reads? */
if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
prc->rc_error = SET_ERROR(EAGAIN);
prc->rc_skipped = 1;
continue;
}
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
prc->rc_offset, prc->rc_abd, prc->rc_size,
zio->io_type, zio->io_priority, 0,
Expand Down Expand Up @@ -2856,7 +2860,7 @@ vdev_check_outlier_state(vdev_t *vd)
* Begin a sit out period for this slow drive
*/
vd->vdev_read_sit_out_expire = gethrtime() +
SEC2NSEC(raid_read_sit_out_secs);
SEC2NSEC(raidz_read_sit_out_secs);

/* count each slow io period */
mutex_enter(&vd->vdev_stat_lock);
Expand All @@ -2865,8 +2869,8 @@ vdev_check_outlier_state(vdev_t *vd)

(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, vd->vdev_spa,
vd, NULL, NULL, 0);
zfs_dbgmsg("vd-%d begin read sit out for %d secs",
(int)vd->vdev_id, (int)raid_read_sit_out_secs);
vdev_dbgmsg(vd, "begin read sit out for %d secs",
(int)raidz_read_sit_out_secs);

vdev_t *raidvd = vd->vdev_parent;
for (int c = 0; c < raidvd->vdev_children; c++) {
Expand Down Expand Up @@ -2919,22 +2923,18 @@ latency_quartiles_fence(const uint64_t *data, size_t n)
return (fence);
}

/*
 * Sort a small array of latency samples in ascending order.
 * Insertion sort is used since the sample count is always small.
 */
static void
latency_sort(uint64_t *samples, int size)
{
	for (int idx = 1; idx < size; idx++) {
		uint64_t key = samples[idx];
		int pos = idx - 1;

		while (pos >= 0 && samples[pos] > key) {
			samples[pos + 1] = samples[pos];
			pos--;
		}
		samples[pos + 1] = key;
	}
}
#define LAT_SAMPLES_STACK 32
#define LAT_SAMPLES_MIN 5
#define LAT_OUTLIER_LIMIT 50

#define STACK_SAMPLES 32
#define MIN_LAT_SAMPLES 5
/*
 * qsort(3) comparator ordering uint64_t latency samples ascending.
 * TREE_CMP() is presumably the usual three-way compare (-1/0/1) used
 * throughout this codebase — confirm against its definition.
 */
static int
latency_compare(const void *arg1, const void *arg2)
{
	const uint64_t *l1 = (uint64_t *)arg1;
	const uint64_t *l2 = (uint64_t *)arg2;

	return (TREE_CMP(*l1, *l2));
}

/*
* Check for any latency outlier from latest set of child reads.
Expand All @@ -2944,11 +2944,11 @@ latency_sort(uint64_t *samples, int size) {
* third quartile plus two times the Interquartile Range (IQR). This range
* is the distance between the first and third quartile.
*/
noinline static void
static void
vdev_child_slow_outlier(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
if (raid_read_sit_out_secs == 0 || vd->vdev_children < MIN_LAT_SAMPLES)
if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
return;

spa_t *spa = zio->io_spa;
Expand All @@ -2968,10 +2968,10 @@ vdev_child_slow_outlier(zio_t *zio)
}

int samples = vd->vdev_children;
uint64_t data[STACK_SAMPLES];
uint64_t data[LAT_SAMPLES_STACK];
uint64_t *lat_data;

if (samples > STACK_SAMPLES)
if (samples > LAT_SAMPLES_STACK)
lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP);
else
lat_data = &data[0];
Expand Down Expand Up @@ -3007,22 +3007,22 @@ vdev_child_slow_outlier(zio_t *zio)
}
}

latency_sort(lat_data, samples);
qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
uint64_t fence = latency_quartiles_fence(lat_data, samples);
if (lat_data[samples - 1] > fence) {
/*
* Keep track of how many times this child has had
 * an outlier read. A disk that persistently has a
* higer than peers outlier count will be considered
* higher than peers outlier count will be considered
* a slow disk.
*/
if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > 50 &&
svd == ovd) {
if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) >
LAT_OUTLIER_LIMIT && svd == ovd) {
vdev_check_outlier_state(svd);
}
}
out:
if (samples > STACK_SAMPLES)
if (samples > LAT_SAMPLES_STACK)
kmem_free(lat_data, sizeof (uint64_t) * samples);
}

Expand Down Expand Up @@ -3766,11 +3766,10 @@ vdev_raidz_io_done(zio_t *zio)
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);

/* Periodically check for a read outlier */
if (i == 0 && zio->io_type == ZIO_TYPE_READ)
vdev_child_slow_outlier(zio);
}
/* Periodically check for a read outlier */
if (zio->io_type == ZIO_TYPE_READ)
vdev_child_slow_outlier(zio);
zio_checksum_verified(zio);
} else {
/*
Expand Down Expand Up @@ -5406,6 +5405,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
ZFS_MODULE_PARAM(zfs_vdev, raid_, read_sit_out_secs, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, raidz_, read_sit_out_secs, ULONG, ZMOD_RW,
"Raidz/draid slow disk sit out time period in seconds");
/* END CSTYLED */

0 comments on commit cff27ce

Please sign in to comment.