Detect a slow raidz child during reads #16900

Open · wants to merge 3 commits into master · showing changes from 1 commit
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
@@ -379,6 +379,7 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
VDEV_PROP_SIT_OUT_READS,
VDEV_NUM_PROPS
} vdev_prop_t;

4 changes: 4 additions & 0 deletions include/sys/vdev_impl.h
@@ -285,6 +285,7 @@ struct vdev {
boolean_t vdev_ishole; /* is a hole in the namespace */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
uint64_t vdev_last_latency_check;

/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
@@ -432,6 +433,9 @@ struct vdev {
hrtime_t vdev_mmp_pending; /* 0 if write finished */
uint64_t vdev_mmp_kstat_id; /* to find kstat entry */
uint64_t vdev_expansion_time; /* vdev's last expansion time */
uint64_t vdev_outlier_count; /* read outlier amongst peers */
uint64_t vdev_ewma_latency; /* moving average read latency */
hrtime_t vdev_read_sit_out_expire; /* end of sit out period */
list_node_t vdev_leaf_node; /* leaf vdev list */

/*
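The new vdev fields above (vdev_ewma_latency, vdev_outlier_count, vdev_read_sit_out_expire) suggest a per-leaf exponentially weighted moving average of read latency that feeds the outlier detection. The actual update logic is not part of this hunk; the standalone sketch below only illustrates the general EWMA form, and the smoothing factor is an assumption for the sketch.

    #include <stdint.h>

    /*
     * Illustrative EWMA update only; the smoothing factor and any
     * fixed-point scaling used by the patch are assumptions here.
     */
    static inline uint64_t
    ewma_update_sketch(uint64_t prev_ewma, uint64_t latency_sample)
    {
    	const uint64_t weight = 64;	/* assumed smoothing factor */

    	if (prev_ewma == 0)
    		return (latency_sample);	/* seed with the first sample */

    	return ((prev_ewma * (weight - 1) + latency_sample) / weight);
    }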
1 change: 1 addition & 0 deletions include/sys/vdev_raidz.h
@@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
void vdev_raidz_reflow_copy_scratch(spa_t *);
void raidz_dtl_reassessed(vdev_t *);
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);

extern const zio_vsd_ops_t vdev_raidz_vsd_ops;

2 changes: 2 additions & 0 deletions include/sys/vdev_raidz_impl.h
@@ -118,6 +118,7 @@ typedef struct raidz_col {
uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */
uint8_t rc_force_repair:1; /* Write good data to this column */
uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */
uint8_t rc_latency_outlier:1; /* Latency outlier for this device */
int rc_shadow_devidx; /* for double write during expansion */
int rc_shadow_error; /* for double write during expansion */
uint64_t rc_shadow_offset; /* for double write during expansion */
@@ -132,6 +133,7 @@ typedef struct raidz_row {
int rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_outlier_cnt; /* Count of latency outlier devices */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
3 changes: 2 additions & 1 deletion lib/libzfs/libzfs.abi
@@ -5917,7 +5917,8 @@
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
<enumerator name='VDEV_NUM_PROPS' value='52'/>
<enumerator name='VDEV_PROP_SIT_OUT_READS' value='52'/>
<enumerator name='VDEV_NUM_PROPS' value='53'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>
2 changes: 2 additions & 0 deletions lib/libzfs/libzfs_pool.c
@@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
/* Only use if provided by the RAIDZ VDEV above */
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
return (ENOENT);
if (prop == VDEV_PROP_SIT_OUT_READS)
return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
(const char **)&strval) != 0)
12 changes: 12 additions & 0 deletions man/man4/zfs.4
@@ -501,6 +501,12 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value.
.It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong
For expanded RAID-Z, aggregate reads that have more rows than this.
.
.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
@tonyhutter (Contributor), Jan 8, 2025:
All references to 'raidz' should be renamed to 'raid', since this works with dRAID (and to match the raid_read_sit_out_secs name in the commit message).

@behlendorf (Contributor), Jan 8, 2025:
It was originally; I asked Don to rename it to "raidz" in this #16900 (comment) so it's consistent with some other existing names. Perhaps the best thing to do would be to rename it read_sit_out_secs. Then we could also generically apply it to mirrors if support is someday added for that.

(Contributor):
Or even sit_out_secs, to be consistent with the other renames.

When a slow disk outlier is detected, it is placed in a sit out state.
While sitting out, the disk will not participate in normal reads; instead, its
data will be reconstructed as needed from parity.
Resilver and scrub operations will always read from a disk, even if it's
sitting out.
Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
Writes will still be issued to a disk which is sitting out to maintain full
redundancy.
Defaults to 600 seconds; a value of zero disables slow disk outlier detection.
.
.It Sy reference_history Ns = Ns Sy 3 Pq int
Maximum reference holders being tracked when reference_tracking_enable is
active.
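To tie the man page description above to the code, here is a minimal sketch of the kind of check a helper such as vdev_sit_out_reads() could perform. The real implementation is not visible in this excerpt; the raidz_read_sit_out_secs tunable and the vdev_read_sit_out_expire field come from this PR, while the flag handling shown is an assumption based on the documented scrub/resilver behavior.

    /*
     * Sketch only, assuming ZFS kernel headers: a leaf sits out normal
     * reads until its expiration time, but scrub and resilver reads are
     * always issued to the disk, per the man page text above.
     */
    static boolean_t
    sit_out_reads_sketch(vdev_t *vd, zio_flag_t io_flags)
    {
    	if (raidz_read_sit_out_secs == 0)
    		return (B_FALSE);	/* outlier detection disabled */

    	if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
    		return (B_FALSE);	/* always read for scrub/resilver */

    	return (vd->vdev_read_sit_out_expire >= gethrtime());
    }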
7 changes: 7 additions & 0 deletions man/man7/vdevprops.7
@@ -104,12 +104,19 @@ Comma separated list of children of this vdev
The number of children belonging to this vdev
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
The number of errors of each type encountered by this vdev
.It Sy sit_out_reads
(Member):
I would rename it to just sit_out here and in other places to be brief. We could use it for other things too. For writes we now have the ability to mark top-level vdevs as non-allocating in some cases, but maybe we could converge those two at some point.

True when a slow disk outlier was detected and the vdev is currently in a sit
out state.
While sitting out, the vdev will not participate in normal reads; instead, its
data will be reconstructed as needed from parity.
.It Sy slow_ios
The number of slow I/Os encountered by this vdev.
These represent I/O operations that didn't complete in
.Sy zio_slow_io_ms
milliseconds
.Pq Sy 30000 No by default .
Can also be incremented when a vdev was determined to be a raidz leaf latency
outlier.
(Contributor):
I'm not sure I understand what you mean by this. Who would be incrementing zio_slow_io_ms?

(Contributor, author):
How about this...

     slow_ios       This indicates the number of slow I/O operations encountered
                    by this vdev.  A slow I/O is defined as an operation that
                    did not complete within the zio_slow_io_ms threshold in
                    milliseconds (30000 by default).  For RAIDZ and DRAID
                    configurations, this value also represents the number of
                    times the vdev was identified as an outlier and excluded
                    from participating in read I/O operations.

(Contributor):
👍

.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
The number of I/O operations of each type performed by this vdev
.It Xo
3 changes: 3 additions & 0 deletions module/zcommon/zpool_prop.c
@@ -466,6 +466,9 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
15 changes: 15 additions & 0 deletions module/zfs/vdev.c
@@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
atomic_store_64(&vd->vdev_outlier_count, 0);
vd->vdev_read_sit_out_expire = 0;

for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@@ -6361,6 +6363,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_SIT_OUT_READS:
/* Only expose this for a draid or raidz leaf */
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_top != NULL &&
(vd->vdev_top->vdev_ops ==
&vdev_raidz_ops ||
vd->vdev_top->vdev_ops ==
&vdev_draid_ops)) {
vdev_prop_add_list(outnvl, propname,
NULL, vdev_sit_out_reads(vd, 0),
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_TRIM_SUPPORT:
/* only valid for leaf vdevs */
if (vd->vdev_ops->vdev_op_leaf) {
23 changes: 23 additions & 0 deletions module/zfs/vdev_draid.c
@@ -1993,6 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_force_repair = 1;
rc->rc_allow_repair = 1;
}
} else if (vdev_sit_out_reads(cvd, zio->io_flags)) {
rr->rr_outlier_cnt++;
rc->rc_latency_outlier = 1;
}
}

/*
* When the row contains a latency outlier and sufficient parity
* exists to reconstruct the column data, then skip reading the
* known slow child vdev as a performance optimization.
*/
if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 &&
(rr->rr_firstdatacol - rr->rr_missingparity) > 0) {

for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_latency_outlier) {
rr->rr_missingdata++;
rc->rc_error = SET_ERROR(EAGAIN);
rc->rc_skipped = 1;
break;
}
}
}

(Contributor):
To skip vdevs which are sitting out during a resilver we need to modify this check slightly. Each resilvering drive will add one to rr->rr_missingdata, and if we're going to sit out one more drive there needs to be sufficient parity available. This updated code should do the trick. You'll need to update the raidz code similarly.

Can you also add versions of tests/functional/replacement/attach_resilver.ksh and tests/functional/replacement/replace_resilver.ksh which sit out a drive during the resilver? We'll want to test raidz[1-3] and draid[1-3].

        /*
         * When the row contains a latency outlier and sufficient parity
         * exists to reconstruct the column data, then skip reading the
         * known slow child vdev as a performance optimization.
         */
        if (rr->rr_outlier_cnt > 0) {
                int available_parity = rr->rr_firstdatacol - rr->rr_missingparity;
                int required_parity = rr->rr_missingdata + 1;

                if (available_parity >= required_parity) {
                        for (int c = rr->rr_cols - 1;
                             c >= rr->rr_firstdatacol; c--) {
                                raidz_col_t *rc = &rr->rr_col[c];

                                if (rc->rc_error == 0 &&
                                    rc->rc_latency_outlier) {
                                        rr->rr_missingdata++;
                                        rc->rc_error = SET_ERROR(EAGAIN);
                                        rc->rc_skipped = 1;
                                        break;
                                }
                        }
                }
        }
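To make the parity bookkeeping in the review suggestion above concrete, the following standalone example (plain C, not ZFS code; the raidz_row_t fields are mimicked by plain ints) shows when a row can still afford to sit out one latency outlier under that check.

    #include <stdio.h>

    /*
     * Toy model of the suggested check: a row can sit out one more child
     * only if the parity still available covers the data already missing
     * plus the one column we are about to skip.
     */
    static int
    can_sit_out(int firstdatacol, int missingparity, int missingdata)
    {
    	int available_parity = firstdatacol - missingparity;
    	int required_parity = missingdata + 1;

    	return (available_parity >= required_parity);
    }

    int
    main(void)
    {
    	/* raidz2 row: 2 parity columns, 1 child resilvering, parity intact */
    	printf("raidz2, one resilvering child: %s\n",
    	    can_sit_out(2, 0, 1) ? "may sit out" : "must read all");

    	/* raidz1 row: 1 parity column, 1 child resilvering */
    	printf("raidz1, one resilvering child: %s\n",
    	    can_sit_out(1, 0, 1) ? "may sit out" : "must read all");

    	return (0);
    }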
