Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16111 rebuild: enhance leader update_and_warn_for_slow_engines() #15778

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/rebuild/rebuild_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2017-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -103,6 +104,7 @@ struct rebuild_tgt_pool_tracker {

struct rebuild_server_status {
d_rank_t rank;
double last_update;
uint32_t dtx_resync_version;
uint32_t scan_done:1,
pull_done:1;
Expand All @@ -122,8 +124,11 @@ struct rebuild_global_pool_tracker {

/** rebuild status for each server */
struct rebuild_server_status *rgt_servers;
double *rgt_servers_last_update;
double rgt_last_warn;

/** timestamps for the global operation */
double rgt_last_warn_ts; /* time of most recent warning log for "slow engines" */
double rgt_scan_warn_deadline_ts; /* time after which to warn that scan may be hung */
double rgt_pull_warn_deadline_ts; /* time after which to warn that pull may be hung */

/** indirect indices for binary search by rank */
struct rebuild_server_status **rgt_servers_sorted;
Expand Down
164 changes: 121 additions & 43 deletions src/rebuild/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,23 @@ is_rebuild_global_done(struct rebuild_global_pool_tracker *rgt)

}

/* determine if "most" engines are done with their current rebuild phase (scan or pull) */
static bool
is_rebuild_phase_mostly_done(int engines_done_ct, int engines_total_ct)
{
const int MIN_WAIT_CT = 1;
const int MAX_WAIT_CT = 20;
int wait_ct_threshold;
int engines_waiting_ct = engines_total_ct - engines_done_ct;

/* When is the global operation "mostly done" (e.g., waiting for 20 or fewer engines)? */
wait_ct_threshold = engines_done_ct - .95 * engines_total_ct;
wait_ct_threshold = max(MIN_WAIT_CT, wait_ct_threshold);
wait_ct_threshold = min(MAX_WAIT_CT, wait_ct_threshold);

return (engines_waiting_ct <= wait_ct_threshold);
}

#define SCAN_DONE 0x1
#define PULL_DONE 0x2

Expand Down Expand Up @@ -246,7 +263,7 @@ rebuild_leader_set_update_time(struct rebuild_global_pool_tracker *rgt, d_rank_t
i = daos_array_find(rgt->rgt_servers_sorted, rgt->rgt_servers_number, rank,
&servers_sort_ops);
if (i >= 0) {
rgt->rgt_servers_last_update[i] = ABT_get_wtime();
rgt->rgt_servers[i].last_update = ABT_get_wtime();
return;
}
D_INFO("rank %u is not included in this rebuild.\n", rank);
Expand Down Expand Up @@ -320,6 +337,101 @@ rpt_lookup(uuid_t pool_uuid, uint32_t opc, unsigned int ver, unsigned int gen)
return found;
}

static void
update_and_warn_for_slow_engines(struct rebuild_global_pool_tracker *rgt)
{
int i;
int scan_ct = 0;
int pull_ct = 0;
int done_ct;
int wait_ct;
bool scan_gl = false;
bool pull_gl = false;
bool do_warn = false;
double now = ABT_get_wtime();
double tw = now - rgt->rgt_last_warn_ts; /* time since last warning logged */
bool warned = false;

/* Throttle warnings to not more often than once per 2 minutes */
do_warn = (tw >= 120);

/* Count scan/pull progress and warn for any ranks that haven't provided updates recently */
for (i = 0; i < rgt->rgt_servers_number; i++) {
double tu = now - rgt->rgt_servers[i].last_update;
d_rank_t r = rgt->rgt_servers[i].rank;

if (rgt->rgt_servers[i].scan_done) {
scan_ct++;
if (rgt->rgt_servers[i].pull_done) {
pull_ct++;
continue;
}
}

if (!do_warn)
continue;

if (tu > 30) {
D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. "
"scan_done=%d pull_done=%d\n",
DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done,
rgt->rgt_servers[i].pull_done);
warned = true;
}
}

scan_gl = (scan_ct == rgt->rgt_servers_number);
pull_gl = (pull_ct == rgt->rgt_servers_number);
if (scan_gl && pull_gl)
return;

/* Determine if scan/pull progress is almost done; mark the time and warn on possible stall.
*/
done_ct = scan_gl ? pull_ct : scan_ct;
wait_ct = rgt->rgt_servers_number - done_ct;
#if 0
D_DEBUG(DB_TRACE, DF_RB ": s_done=%s, s_ct=%d, p_done=%s, p_ct=%d, servers=%d, d_ct=%d, "
" almost=%s\n", DP_RB_RGT(rgt), scan_gl ? "yes" : "no", scan_ct, pull_gl ? "yes" : "no",
pull_ct, rgt->rgt_servers_number, done_ct,
is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number) ? "yes" : "no");
#endif
if (is_rebuild_phase_mostly_done(done_ct, rgt->rgt_servers_number)) {
if (!scan_gl && rgt->rgt_scan_warn_deadline_ts == 0.0) {
rgt->rgt_scan_warn_deadline_ts = now + 120.0;
D_DEBUG(DB_REBUILD, DF_RB ": scan almost done, %d/%d engines\n",
DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number);
} else if (!pull_gl && rgt->rgt_pull_warn_deadline_ts == 0.0) {
rgt->rgt_pull_warn_deadline_ts = now + 120.0;
D_DEBUG(DB_REBUILD, DF_RB ": pull almost done, %d/%d engines\n",
DP_RB_RGT(rgt), done_ct, rgt->rgt_servers_number);
}

if (!do_warn)
return;

if (!scan_gl && now > rgt->rgt_scan_warn_deadline_ts) {
D_WARN(DF_RB ": scan hung? waiting for %d/%d engines:\n", DP_RB_RGT(rgt),
wait_ct, rgt->rgt_servers_number);
for (i = 0; i < rgt->rgt_servers_number; i++)
if (!rgt->rgt_servers[i].scan_done)
D_WARN(DF_RB ": rank %u not finished scanning!\n",
DP_RB_RGT(rgt), rgt->rgt_servers[i].rank);
warned = true;
} else if (!pull_gl && now > rgt->rgt_pull_warn_deadline_ts) {
D_WARN(DF_RB ": pull hung? waiting for %d/%d engines:\n", DP_RB_RGT(rgt),
wait_ct, rgt->rgt_servers_number);
for (i = 0; i < rgt->rgt_servers_number; i++)
if (!rgt->rgt_servers[i].pull_done)
D_WARN(DF_RB ": rank %u not finished pulling!\n",
DP_RB_RGT(rgt), rgt->rgt_servers[i].rank);
warned = true;
}

if (warned)
rgt->rgt_last_warn_ts = now;
}
}

int
rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt,
struct rebuild_iv *iv)
Expand All @@ -346,8 +458,10 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt,
* both scan and pull status to the leader, i.e. they
* both can be trusted.
*/
if (iv->riv_status == 0)
if (iv->riv_status == 0) {
/* test only: update_and_warn_for_slow_engines(rgt); */
return 0;
}
}

/* Only trust pull done if scan is done globally */
Expand All @@ -358,6 +472,8 @@ rebuild_global_status_update(struct rebuild_global_pool_tracker *rgt,
rgt->rgt_rebuild_ver, iv->riv_rank);
}

/* test only: update_and_warn_for_slow_engines(rgt); */

return 0;
}

Expand Down Expand Up @@ -703,39 +819,6 @@ enum {
RB_BCAST_QUERY,
};

static void
warn_for_slow_engine_updates(struct rebuild_global_pool_tracker *rgt)
{
int i;
double now = ABT_get_wtime();
double tw = now - rgt->rgt_last_warn; /* time since last warning logged */
bool warned = false;

/* Throttle warnings to not more often than once per 2 minutes */
if (tw < 120)
return;

/* Warn for ranks not done and that haven't provided updates in a while (> 30 sec) */
for (i = 0; i < rgt->rgt_servers_number; i++) {
double tu = now - rgt->rgt_servers_last_update[i];
d_rank_t r = rgt->rgt_servers[i].rank;

if (rgt->rgt_servers[i].scan_done && rgt->rgt_servers[i].pull_done)
continue;

if (tu > 30) {
D_WARN(DF_RB ": no updates from rank %u in %8.3f seconds. "
"scan_done=%d pull_done=%d\n",
DP_RB_RGT(rgt), r, tu, rgt->rgt_servers[i].scan_done,
rgt->rgt_servers[i].pull_done);
warned = true;
}
}

if (warned)
rgt->rgt_last_warn = now;
}

/*
* Check rebuild status on the leader. Every other target sends
* its own rebuild status by IV.
Expand Down Expand Up @@ -865,7 +948,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op,
D_PRINT("%s", sbuf);
}
sleep:
warn_for_slow_engine_updates(rgt);
update_and_warn_for_slow_engines(rgt);
sched_req_sleep(rgt->rgt_ult, RBLD_CHECK_INTV);
}

Expand All @@ -882,8 +965,6 @@ rebuild_global_pool_tracker_destroy(struct rebuild_global_pool_tracker *rgt)
D_FREE(rgt->rgt_servers);
if (rgt->rgt_servers_sorted)
D_FREE(rgt->rgt_servers_sorted);
if (rgt->rgt_servers_last_update)
D_FREE(rgt->rgt_servers_last_update);

if (rgt->rgt_lock)
ABT_mutex_free(&rgt->rgt_lock);
Expand Down Expand Up @@ -921,16 +1002,13 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t
D_ALLOC_ARRAY(rgt->rgt_servers_sorted, rank_nr);
if (rgt->rgt_servers_sorted == NULL)
D_GOTO(out, rc = -DER_NOMEM);
D_ALLOC_ARRAY(rgt->rgt_servers_last_update, rank_nr);
if (rgt->rgt_servers_last_update == NULL)
D_GOTO(out, rc = -DER_NOMEM);

now = ABT_get_wtime();
rgt->rgt_last_warn = now;
rgt->rgt_last_warn_ts = now;
for (i = 0; i < rank_nr; i++) {
rgt->rgt_servers_sorted[i] = &rgt->rgt_servers[i];
rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank;
rgt->rgt_servers_last_update[i] = now;
rgt->rgt_servers[i].last_update = now;
}
rgt->rgt_servers_number = rank_nr;

Expand Down
17 changes: 17 additions & 0 deletions src/tests/ftest/daos_test/rebuild.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2018-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand Down Expand Up @@ -365,3 +366,19 @@ def test_rebuild_35(self):
:avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_35
"""
self.run_subtest()

def test_rebuild_36(self):
"""Jira ID: DAOS-16111

Test Description:
Run daos_test -r -s5 -u subtests=36

Use cases:
Core tests for daos_test rebuild

:avocado: tags=all,full_regression
:avocado: tags=hw,medium
:avocado: tags=unittest,rebuild
:avocado: tags=DaosCoreTestRebuild,daos_test,daos_core_test_rebuild,test_rebuild_36
"""
self.run_subtest()
4 changes: 4 additions & 0 deletions src/tests/ftest/daos_test/rebuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ timeouts:
test_rebuild_33: 200
test_rebuild_34: 200
test_rebuild_35: 180
test_rebuild_36: 250
pool:
nvme_size: 0G

Expand Down Expand Up @@ -81,6 +82,7 @@ daos_tests:
test_rebuild_33: DAOS_Rebuild_33
test_rebuild_34: DAOS_Rebuild_34
test_rebuild_35: DAOS_Rebuild_35
test_rebuild_36: DAOS_Rebuild_36
daos_test:
test_rebuild_0to10: r
test_rebuild_12to15: r
Expand All @@ -104,6 +106,7 @@ daos_tests:
test_rebuild_33: r
test_rebuild_34: r
test_rebuild_35: r
test_rebuild_36: r
args:
test_rebuild_0to10: -s3 -u subtests="0-10"
test_rebuild_12to15: -s3 -u subtests="12-15"
Expand All @@ -127,6 +130,7 @@ daos_tests:
test_rebuild_33: -s5 -u subtests="33"
test_rebuild_34: -s5 -u subtests="34"
test_rebuild_35: -s5 -u subtests="35"
test_rebuild_36: -s5 -u subtests="36"
stopped_ranks:
test_rebuild_26: ["random"]
test_rebuild_27: ["random"]
Expand Down
Loading
Loading