From 3965f734d1709d38b57436206c2d5895d4ee993b Mon Sep 17 00:00:00 2001
From: Don Brady
Date: Mon, 6 Jan 2025 21:26:14 +0000
Subject: [PATCH] Detect a slow raidz child during reads

A single slow-responding disk can affect the overall read performance
of a raidz group.  When a raidz child disk is determined to be a
persistent slow outlier, it is made to sit out during reads for a
period of time.  The raidz group can use parity to reconstruct the
data that was skipped.

Each time a slow disk is placed into a sit out period, its
`vdev_stat.vs_slow_ios` count is incremented and a zevent class
`ereport.fs.zfs.delay` is posted.

The length of the sit out period can be changed using the
`raidz_read_sit_out_secs` module parameter.  Setting it to zero
disables slow outlier detection.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Don Brady
---
 include/sys/fs/zfs.h          |   1 +
 include/sys/vdev_impl.h       |   4 +
 include/sys/vdev_raidz.h      |   1 +
 include/sys/vdev_raidz_impl.h |   2 +
 lib/libzfs/libzfs.abi         |   3 +-
 lib/libzfs/libzfs_pool.c      |   2 +
 man/man4/zfs.4                |  12 ++
 man/man7/vdevprops.7          |   7 +
 module/zcommon/zpool_prop.c   |   3 +
 module/zfs/vdev.c             |  16 ++
 module/zfs/vdev_draid.c       |  23 +++
 module/zfs/vdev_raidz.c       | 285 ++++++++++++++++++++++++++++++++++
 12 files changed, 358 insertions(+), 1 deletion(-)

diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index dc474e3739f3..547cfb72d1c4 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -379,6 +379,7 @@ typedef enum {
 	VDEV_PROP_TRIM_SUPPORT,
 	VDEV_PROP_TRIM_ERRORS,
 	VDEV_PROP_SLOW_IOS,
+	VDEV_PROP_SIT_OUT_READS,
 	VDEV_NUM_PROPS
 } vdev_prop_t;

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..a9310f16fffb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -285,6 +285,7 @@ struct vdev {
 	boolean_t	vdev_ishole;	/* is a hole in the namespace */
 	uint64_t	vdev_top_zap;
 	vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
+	uint64_t	vdev_last_latency_check; /* time of last latency check */

 	/* pool checkpoint related */
 	space_map_t	*vdev_checkpoint_sm;	/* contains reserved blocks */
@@ -432,6 +433,9 @@ struct vdev {
 	hrtime_t	vdev_mmp_pending; /* 0 if write finished */
 	uint64_t	vdev_mmp_kstat_id;	/* to find kstat entry */
 	uint64_t	vdev_expansion_time;	/* vdev's last expansion time */
+	uint64_t	vdev_outlier_count;	/* read outlier amongst peers */
+	uint64_t	vdev_ewma_latency;	/* moving average read latency */
+	hrtime_t	vdev_read_sit_out_expire; /* end of sit out period */
 	list_node_t	vdev_leaf_node;		/* leaf vdev list */

 	/*
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 64f484e9aa13..96779b65058b 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -60,6 +60,7 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *);
 struct raidz_row *vdev_raidz_row_alloc(int, zio_t *);
 void vdev_raidz_reflow_copy_scratch(spa_t *);
 void raidz_dtl_reassessed(vdev_t *);
+boolean_t vdev_skip_latency_outlier(vdev_t *, zio_flag_t);

 extern const zio_vsd_ops_t vdev_raidz_vsd_ops;

diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 45cb5864a22b..07f8e560c747 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -118,6 +118,7 @@ typedef struct raidz_col {
 	uint8_t rc_need_orig_restore:1;	/* need to restore from orig_data? */
 	uint8_t rc_force_repair:1;	/* Write good data to this column */
 	uint8_t rc_allow_repair:1;	/* Allow repair I/O to this column */
+	uint8_t rc_latency_outlier:1;	/* Latency outlier for this device */
 	int rc_shadow_devidx;		/* for double write during expansion */
 	int rc_shadow_error;		/* for double write during expansion */
 	uint64_t rc_shadow_offset;	/* for double write during expansion */
@@ -132,6 +133,7 @@ typedef struct raidz_row {
 	int rr_firstdatacol;		/* First data column/parity count */
 	abd_t *rr_abd_empty;		/* dRAID empty sector buffer */
 	int rr_nempty;			/* empty sectors included in parity */
+	int rr_outlier_cnt;		/* Count of latency outlier devices */
 #ifdef ZFS_DEBUG
 	uint64_t rr_offset;		/* Logical offset for *_io_verify() */
 	uint64_t rr_size;		/* Physical size for *_io_verify() */
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 1f9fde6677d8..f2aef8754460 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -5917,7 +5917,8 @@
-
+
+
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 64f9d1f6eb49..ec30d2a6ddd3 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5478,6 +5478,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
 		/* Only use if provided by the RAIDZ VDEV above */
 		if (prop == VDEV_PROP_RAIDZ_EXPANDING)
 			return (ENOENT);
+		if (prop == VDEV_PROP_SIT_OUT_READS)
+			return (ENOENT);
 	}

 	if (vdev_prop_index_to_string(prop, intval,
 	    (const char **)&strval) != 0)
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..c61303d28c91 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -501,6 +501,18 @@ For testing, pause RAID-Z expansion when reflow amount reaches this value.
 .It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong
 For expanded RAID-Z, aggregate reads that have more rows than this.
 .
+.It Sy raidz_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong
+When a slow disk outlier is detected, it is placed in a sit out state.
+While sitting out, the disk will not participate in normal reads; instead,
+its data will be reconstructed as needed from parity.
+Resilver and scrub operations will always read from a disk, even if it's
+sitting out.
+Only a single disk in a RAID-Z or dRAID vdev may sit out at the same time.
+Writes will still be issued to a disk which is sitting out, to maintain full
+redundancy.
+Defaults to 600 seconds; a value of zero disables slow disk outlier
+detection.
+.
 .It Sy reference_history Ns = Ns Sy 3 Pq int
 Maximum reference holders being tracked when reference_tracking_enable is active.
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index 34d4026b1009..844864518c1e 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -104,12 +104,19 @@ Comma separated list of children of this vdev
 The number of children belonging to this vdev
 .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
 The number of errors of each type encountered by this vdev
+.It Sy sit_out_reads
+True when a slow disk outlier has been detected and the vdev is currently in
+a sit out state.
+While sitting out, the vdev will not participate in normal reads; instead,
+its data will be reconstructed as needed from parity.
 .It Sy slow_ios
 The number of slow I/Os encountered by this vdev,
 These represent I/O operations that didn't complete in
 .Sy zio_slow_io_ms
 milliseconds
 .Pq Sy 30000 No by default .
+This count is also incremented when a vdev is determined to be a raidz leaf
+latency outlier.
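
To make the sit-out semantics documented above concrete, here is a minimal standalone C sketch (illustration only, not part of this diff) of the gating rule that the new vdev_skip_latency_outlier() helper applies later in the patch: reads are skipped only while the feature is enabled and an expiry timestamp is set, and never for scrub or resilver I/O. The flag values and variable names are simplified stand-ins for the kernel types.

/* Illustrative sketch only -- not part of the patch. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	FLAG_SCRUB	(1u << 0)	/* stand-in for ZIO_FLAG_SCRUB */
#define	FLAG_RESILVER	(1u << 1)	/* stand-in for ZIO_FLAG_RESILVER */

/* Simplified stand-ins for the module parameter and the new vdev field. */
static uint64_t read_sit_out_secs = 600;	/* raidz_read_sit_out_secs */
static uint64_t read_sit_out_expire = 0;	/* vdev_read_sit_out_expire */

/* Mirrors the gating logic of vdev_skip_latency_outlier(). */
static bool
skip_read(uint32_t io_flags)
{
	if (read_sit_out_secs == 0)
		return (false);			/* feature disabled */
	if (io_flags & (FLAG_SCRUB | FLAG_RESILVER))
		return (false);			/* scrub/resilver always reads */
	return (read_sit_out_expire != 0);	/* sitting out until expiry */
}

int
main(void)
{
	read_sit_out_expire = 1;		/* pretend a sit out is active */
	printf("normal read skipped: %d\n", skip_read(0));
	printf("scrub read skipped:  %d\n", skip_read(FLAG_SCRUB));
	return (0);
}
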
 .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
 The number of I/O operations of each type performed by this vdev
 .It Xo
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index ea9eda4b316d..461fc7faefe2 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -466,6 +466,9 @@ vdev_prop_init(void)
 	zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
 	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
 	    boolean_table, sfeatures);
+	zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0,
+	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS",
+	    boolean_table, sfeatures);
 	zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
 	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
 	    boolean_table, sfeatures);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..a6eacd3343f4 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -4521,6 +4521,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 		vd->vdev_stat.vs_checksum_errors = 0;
 		vd->vdev_stat.vs_dio_verify_errors = 0;
 		vd->vdev_stat.vs_slow_ios = 0;
+		atomic_store_64(&vd->vdev_outlier_count, 0);
+		vd->vdev_read_sit_out_expire = 0;

 		for (int c = 0; c < vd->vdev_children; c++)
 			vdev_clear(spa, vd->vdev_child[c]);
@@ -6361,6 +6363,20 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 					    ZPROP_SRC_NONE);
 				}
 				continue;
+			case VDEV_PROP_SIT_OUT_READS:
+				/* Only expose this for a draid or raidz leaf */
+				if (vd->vdev_ops->vdev_op_leaf &&
+				    vd->vdev_top != NULL &&
+				    (vd->vdev_top->vdev_ops ==
+				    &vdev_raidz_ops ||
+				    vd->vdev_top->vdev_ops ==
+				    &vdev_draid_ops)) {
+					vdev_prop_add_list(outnvl, propname,
+					    NULL,
+					    vd->vdev_read_sit_out_expire != 0,
+					    ZPROP_SRC_NONE);
+				}
+				continue;
 			case VDEV_PROP_TRIM_SUPPORT:
 				/* only valid for leaf vdevs */
 				if (vd->vdev_ops->vdev_op_leaf) {
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index 419c8ac5bb28..23dca5a73f71 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1993,6 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 				rc->rc_force_repair = 1;
 				rc->rc_allow_repair = 1;
 			}
+		} else if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
+			rr->rr_outlier_cnt++;
+			rc->rc_latency_outlier = 1;
+		}
+	}
+
+	/*
+	 * When the row contains a latency outlier and sufficient parity
+	 * exists to reconstruct the column data, then skip reading the
+	 * known slow child vdev as a performance optimization.
+	 */
+	if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 &&
+	    (rr->rr_firstdatacol - rr->rr_missingparity) > 0) {
+
+		for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+			raidz_col_t *rc = &rr->rr_col[c];
+
+			if (rc->rc_latency_outlier) {
+				rr->rr_missingdata++;
+				rc->rc_error = SET_ERROR(EAGAIN);
+				rc->rc_skipped = 1;
+				break;
+			}
 		}
 	}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 6103f780e6bc..322b384994a1 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -354,6 +354,13 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
  */
 uint_t raidz_expand_pause_point = 0;

+/*
+ * Duration, in seconds, that a slow drive sits out of reads after it has
+ * been detected as a persistent read latency outlier.
+ */
+static unsigned long raidz_read_sit_out_secs = 600;
+
+static hrtime_t raid_outlier_check_interval_ms = 20; /* ms between checks */
+
 /*
  * Maximum amount of copy io's outstanding at once.
  */
@@ -2281,6 +2288,45 @@ vdev_raidz_min_asize(vdev_t *vd)
 	    vd->vdev_children);
 }

+/*
+ * Return B_TRUE if a read should be skipped due to being too slow.
+ *
+ * vdev_child_slow_outlier() looks for outliers based on disk
+ * latency from the most recent child reads.  Here we're checking if,
+ * over time, a disk has been an outlier too many times and is
+ * now in a sit out period.
+ */
+boolean_t
+vdev_skip_latency_outlier(vdev_t *vd, zio_flag_t io_flags)
+{
+	if (raidz_read_sit_out_secs == 0)
+		return (B_FALSE);
+
+	/* Avoid skipping a data column read when scrubbing or resilvering */
+	if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+		return (B_FALSE);
+
+	return (vd->vdev_read_sit_out_expire != 0);
+}
+
+/*
+ * Calculate the Exponentially Weighted Moving Average (EWMA)
+ * where
+ *	alpha: the smoothing factor -- represented here as a scaled integer
+ *	scale: the number of bits used to scale alpha
+ */
+static uint64_t
+calculate_ewma(uint64_t previous_ewma, uint64_t latest_value)
+{
+	/*
+	 * Scale using 16 bits with an effective alpha of 0.50
+	 */
+	const uint64_t scale = 16;
+	const uint64_t alpha = 32768;
+
+	return (((alpha * latest_value) + (((1ULL << scale) - alpha) *
+	    previous_ewma)) >> scale);
+}
+
 void
 vdev_raidz_child_done(zio_t *zio)
 {
@@ -2290,6 +2336,23 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_error = zio->io_error;
 	rc->rc_tried = 1;
 	rc->rc_skipped = 0;
+
+	/*
+	 * Process the disk I/O latency before it goes out of scope.
+	 *
+	 * A zio->io_delay value of zero means this IO was part of
+	 * an aggregation.
+	 */
+	if (zio->io_type == ZIO_TYPE_READ && zio->io_error == 0 &&
+	    zio->io_size > 0 && zio->io_delay != 0) {
+		vdev_t *vd = zio->io_vd;
+		uint64_t previous_ewma = atomic_load_64(&vd->vdev_ewma_latency);
+		if (previous_ewma == 0)
+			previous_ewma = zio->io_delay;
+
+		atomic_store_64(&vd->vdev_ewma_latency,
+		    calculate_ewma(previous_ewma, zio->io_delay));
+	}
 }

 static void
@@ -2445,6 +2508,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
 			rc->rc_skipped = 1;
 			continue;
 		}
+
+		if (vdev_skip_latency_outlier(cvd, zio->io_flags)) {
+			rr->rr_outlier_cnt++;
+			rc->rc_latency_outlier = 1;
+		}
+	}
+
+	/*
+	 * When the row contains a latency outlier and sufficient parity
+	 * exists to reconstruct the column data, then skip reading the
+	 * known slow child vdev as a performance optimization.
+	 */
+	if (rr->rr_outlier_cnt > 0 && rr->rr_missingdata == 0 &&
+	    (rr->rr_firstdatacol - rr->rr_missingparity) > 0) {
+
+		for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+			raidz_col_t *rc = &rr->rr_col[c];
+
+			if (rc->rc_latency_outlier) {
+				rr->rr_missingdata++;
+				rc->rc_error = SET_ERROR(EAGAIN);
+				rc->rc_skipped = 1;
+				break;
+			}
+		}
+	}
+
+	for (int c = rr->rr_cols - 1; c >= 0; c--) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+		if (rc->rc_error || rc->rc_size == 0)
+			continue;
+
 		if (forceparity ||
 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
@@ -2468,6 +2565,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
 		ASSERT3U(prc->rc_devidx, ==, i);
 		vdev_t *cvd = vd->vdev_child[i];
+
 		if (!vdev_readable(cvd)) {
 			prc->rc_error = SET_ERROR(ENXIO);
 			prc->rc_tried = 1;	/* don't even try */
@@ -2744,6 +2842,187 @@ vdev_raidz_worst_error(raidz_row_t *rr)
 	return (error);
 }

+static void
+vdev_check_outlier_state(vdev_t *vd)
+{
+	/*
+	 * A slow vdev child can be in one of four states:
+	 * 1. monitoring for outlier classification
+	 * 2. determined to be an outlier and beginning a sit out period
+	 * 3. inside a sit out period
+	 * 4. finished sit out period and back to monitoring state
+	 */
+	if (vd->vdev_read_sit_out_expire == 0) {
+		/*
+		 * Begin a sit out period for this slow drive
+		 */
+		vd->vdev_read_sit_out_expire = gethrtime() +
+		    SEC2NSEC(raidz_read_sit_out_secs);
+
+		/* count each slow I/O period */
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_slow_ios++;
+		mutex_exit(&vd->vdev_stat_lock);
+
+		(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, vd->vdev_spa,
+		    vd, NULL, NULL, 0);
+		vdev_dbgmsg(vd, "begin read sit out for %d secs",
+		    (int)raidz_read_sit_out_secs);
+
+		vdev_t *raidvd = vd->vdev_parent;
+		for (int c = 0; c < raidvd->vdev_children; c++) {
+			atomic_store_64(
+			    &raidvd->vdev_child[c]->vdev_outlier_count, 0);
+			atomic_store_64(
+			    &raidvd->vdev_child[c]->vdev_ewma_latency, 0);
+		}
+	} else if (vd->vdev_read_sit_out_expire < gethrtime()) {
+		/* Done with our sit out -- wait for new outlier to emerge */
+		vd->vdev_read_sit_out_expire = 0;
+	}
+}
+
+/*
+ * Find the median value from a sorted set of n values
+ */
+static uint64_t
+latency_median_value(const uint64_t *data, size_t n)
+{
+	uint64_t m;
+
+	if (n % 2 == 0)
+		m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1;
+	else
+		m = data[((n + 1) >> 1) - 1];
+
+	return (m);
+}
+
+/*
+ * Calculate the outlier fence from a sorted set of n latency values
+ *
+ * fence = Q3 + 2 x (Q3 - Q1)
+ */
+static uint64_t
+latency_quartiles_fence(const uint64_t *data, size_t n)
+{
+	uint64_t q1, q3;
+
+	q1 = latency_median_value(&data[0], n >> 1);
+	if (n % 2 == 0)
+		q3 = latency_median_value(&data[n >> 1], n >> 1);
+	else
+		q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
+
+	uint64_t iqr = q3 - q1;
+	uint64_t fence = q3 + 2 * iqr;
+
+	return (fence);
+}
+
+#define	LAT_SAMPLES_STACK	64
+#define	LAT_SAMPLES_MIN		5
+#define	LAT_OUTLIER_LIMIT	50
+
+static int
+latency_compare(const void *arg1, const void *arg2)
+{
+	const uint64_t *l1 = (uint64_t *)arg1;
+	const uint64_t *l2 = (uint64_t *)arg2;
+
+	return (TREE_CMP(*l1, *l2));
+}
+
+/*
+ * Check for any latency outlier from the latest set of child reads.
+ *
+ * Uses Tukey's fence, with K = 2, for detecting extreme outliers.  This
+ * rule defines extreme outliers as data points outside the fence of the
+ * third quartile plus two times the Interquartile Range (IQR).  This range
+ * is the distance between the first and third quartiles.
+ */
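
As a concrete check of the fence arithmetic described above, the following standalone sketch (illustration only, not part of this diff) applies the same median/quartile scheme to a hypothetical, already-sorted set of per-child EWMA latencies and uses the K = 2 fence stated in the comment. With an interquartile range of 200 us, the fence lands at 1700 us and the 9000 us child is flagged as an outlier.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
median(const uint64_t *d, size_t n)
{
	/* Same rule as latency_median_value() above. */
	return (n % 2 == 0) ? (d[n / 2 - 1] + d[n / 2]) / 2 : d[(n - 1) / 2];
}

int
main(void)
{
	/* Hypothetical per-child EWMA read latencies (us), already sorted. */
	uint64_t lat[] = { 1000, 1100, 1150, 1200, 1300, 9000 };
	size_t n = sizeof (lat) / sizeof (lat[0]);

	uint64_t q1 = median(lat, n / 2);		/* lower half -> 1100 */
	uint64_t q3 = median(lat + n / 2, n / 2);	/* upper half -> 1300 */
	uint64_t fence = q3 + 2 * (q3 - q1);		/* Q3 + 2*IQR -> 1700 */

	printf("Q1=%llu Q3=%llu fence=%llu slowest=%llu outlier=%s\n",
	    (unsigned long long)q1, (unsigned long long)q3,
	    (unsigned long long)fence, (unsigned long long)lat[n - 1],
	    lat[n - 1] > fence ? "yes" : "no");
	return (0);
}
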
+static void
+vdev_child_slow_outlier(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
+		return;
+
+	spa_t *spa = zio->io_spa;
+	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+	    spa_load_state(spa) == SPA_LOAD_RECOVER ||
+	    (spa_load_state(spa) != SPA_LOAD_NONE &&
+	    spa->spa_last_open_failed)) {
+		return;
+	}
+
+	hrtime_t now = gethrtime();
+	uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
+
+	if ((now - last) < MSEC2NSEC(raid_outlier_check_interval_ms) ||
+	    atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) {
+		return;
+	}
+
+	int samples = vd->vdev_children;
+	uint64_t data[LAT_SAMPLES_STACK];
+	uint64_t *lat_data;
+
+	if (samples > LAT_SAMPLES_STACK)
+		lat_data = kmem_alloc(sizeof (uint64_t) * samples, KM_SLEEP);
+	else
+		lat_data = &data[0];
+
+	uint64_t max = 0;
+	uint64_t max_outlier_count = 0;
+	vdev_t *svd = NULL;	/* suspect vdev */
+	vdev_t *ovd = NULL;	/* largest outlier vdev */
+	for (int c = 0; c < samples; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		/* A child is already sitting out -- skip the check for now */
+		if (cvd->vdev_read_sit_out_expire > 0) {
+			atomic_store_64(&cvd->vdev_ewma_latency, 0);
+			goto out;
+		}
+
+		lat_data[c] = atomic_load_64(&cvd->vdev_ewma_latency);
+
+		/* wait until all disks have been read from */
+		if (lat_data[c] == 0)
+			goto out;
+
+		/* keep track of the vdev with largest value */
+		if (lat_data[c] > max) {
+			max = lat_data[c];
+			svd = cvd;
+		}
+
+		uint64_t count = atomic_load_64(&cvd->vdev_outlier_count);
+		if (count > max_outlier_count) {
+			max_outlier_count = count;
+			ovd = cvd;
+		}
+	}
+
+	qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
+	uint64_t fence = latency_quartiles_fence(lat_data, samples);
+	if (lat_data[samples - 1] > fence) {
+		/*
+		 * Keep track of how many times this child has had
+		 * an outlier read.  A disk that persistently has a
+		 * higher outlier count than its peers will be
+		 * considered a slow disk.
+		 */
+		if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) >
+		    LAT_OUTLIER_LIMIT && svd == ovd) {
+			vdev_check_outlier_state(svd);
+		}
+	}
+out:
+	if (samples > LAT_SAMPLES_STACK)
+		kmem_free(lat_data, sizeof (uint64_t) * samples);
+}
+
 static void
 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 {
@@ -3485,6 +3764,9 @@ vdev_raidz_io_done(zio_t *zio)
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_done_verified(zio, rr);
 		}
+		/* Periodically check for a read outlier */
+		if (zio->io_type == ZIO_TYPE_READ)
+			vdev_child_slow_outlier(zio);
 		zio_checksum_verified(zio);
 	} else {
 		/*
@@ -5120,3 +5402,6 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
 	"completes");
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, read_sit_out_secs, ULONG, ZMOD_RW,
+	"Raidz/draid slow disk sit out time period in seconds");
+/* END CSTYLED */
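
For reference, here is a standalone sketch (illustration only, not part of this diff) of how the fixed-point EWMA used by calculate_ewma() behaves. With alpha = 32768 on a 16-bit scale (effectively 0.5), each sample moves the average halfway toward the most recent io_delay, so a persistently slow child's average converges toward its true latency within a few reads, while an isolated slow read is quickly damped back down. The latency values below are made up for illustration.

/* Illustrative sketch only -- not part of the patch. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Same fixed-point EWMA as calculate_ewma() in this patch. */
static uint64_t
ewma(uint64_t prev, uint64_t latest)
{
	const uint64_t scale = 16;
	const uint64_t alpha = 32768;	/* 0.5 in 16-bit fixed point */

	return (((alpha * latest) +
	    (((1ULL << scale) - alpha) * prev)) >> scale);
}

int
main(void)
{
	/* Hypothetical read latencies (ns): one slow read, then a slow disk. */
	uint64_t samples[] = { 1000, 1000, 8000, 1000, 8000, 8000, 8000 };
	uint64_t avg = 0;

	for (size_t i = 0; i < sizeof (samples) / sizeof (samples[0]); i++) {
		/* First sample seeds the average, as vdev_raidz_child_done() does. */
		avg = ewma(avg == 0 ? samples[i] : avg, samples[i]);
		printf("sample=%llu ewma=%llu\n",
		    (unsigned long long)samples[i], (unsigned long long)avg);
	}
	return (0);
}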