DAOS-14011 rebuild: incremental reintegration (#15671)
* DAOS-14011 rebuild: incremental reintegration

Incremental reintegration: set the pool property "reintegration:incremental" to
enable it; it is disabled by default.

dmg pool set-prop pool_label "reintegration:incremental", or
dmg pool create pool_label --properties="reintegration:incremental"

Some internals (a sketch of the stable-epoch computation follows this list):
- Maintain a global stable epoch through IV; it is the global minimum of the
  stable epochs across all targets of all engines for the container.
- Reintegration will rebuild from that stable epoch.
- Some containers or objects may be destroyed between the exclude and the
  reintegration, so reintegration should try to discard those containers or
  objects; a reintegration post-process step is added for that purpose.
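
The first item above is essentially a min-fold over per-target reports. A minimal
sketch of that computation, using hypothetical types and names rather than the
actual DAOS IV machinery:

```c
/*
 * Minimal sketch, hypothetical types and names (not the DAOS IV API):
 * the global stable epoch is the minimum of the per-target stable epochs
 * reported by every engine for the container.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t epoch_t;

struct target_report {
	int     rank;        /* engine rank that reported */
	int     target_idx;  /* VOS target index on that engine */
	epoch_t stable_eph;  /* locally stable epoch of that target */
};

/* Fold per-target reports into the container's global stable epoch. */
static epoch_t
global_stable_epoch(const struct target_report *reports, int nr)
{
	epoch_t min_eph = UINT64_MAX;
	int     i;

	if (nr == 0)
		return 0; /* no reports yet: nothing is globally stable */

	for (i = 0; i < nr; i++)
		if (reports[i].stable_eph < min_eph)
			min_eph = reports[i].stable_eph;

	return min_eph;
}

int
main(void)
{
	struct target_report reports[] = {
		{ .rank = 0, .target_idx = 0, .stable_eph = 1200 },
		{ .rank = 0, .target_idx = 1, .stable_eph = 1150 },
		{ .rank = 1, .target_idx = 0, .stable_eph = 1300 },
	};

	/* Incremental reintegration migrates only data newer than this. */
	printf("global stable epoch: %" PRIu64 "\n",
	       global_stable_epoch(reports, 3));
	return 0;
}
```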

Signed-off-by: Xuezhao Liu <[email protected]>
liuxuezhao authored Jan 23, 2025
1 parent d38cd11 commit a3d4d2f
Showing 27 changed files with 1,061 additions and 440 deletions.
10 changes: 6 additions & 4 deletions docs/admin/pool_operations.md
@@ -1053,10 +1053,12 @@ automatically adjusted.

#### Reintegration mode (reintegration)

This property controls how reintegration will recover data. Two options are supported:
"data_sync" (default strategy) and "no_data_sync". with "data_sync", reintegration will
discard pool data and trigger rebuild to sync data. While with "no_data_sync", reintegration
only updates pool map to include rank.
This property controls how reintegration will recover data. Three options are supported:
"data_sync" (the default strategy), "no_data_sync", and "incremental". With "data_sync",
reintegration will discard pool data and trigger rebuild to sync data. With "no_data_sync",
reintegration only updates the pool map to include the rank. With "incremental", reintegration
will not discard pool data but will trigger rebuild to sync only the data beyond the global
stable epoch; the reintegration is incremental because old data below the global stable epoch
need not be migrated.
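
As orientation for the "incremental" option, a simplified sketch of how the mode could
select the rebuild range. The mode names mirror the DAOS_REINT_MODE_* constants validated
in src/common/prop.c in this commit; the types and helper here are hypothetical, not the
actual rebuild code:

```c
/*
 * Illustrative sketch only: maps each reintegration mode to the rebuild
 * work it implies. Types and helper are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t epoch_t;

enum reint_mode {
	REINT_MODE_DATA_SYNC,    /* discard data, rebuild everything */
	REINT_MODE_NO_DATA_SYNC, /* pool-map update only, no rebuild */
	REINT_MODE_INCREMENTAL,  /* keep data, rebuild beyond the stable epoch */
};

/*
 * Returns true if a rebuild is needed and sets *start_eph to the first
 * epoch that must be migrated to the reintegrated target.
 */
bool
reint_rebuild_range(enum reint_mode mode, epoch_t global_stable_eph,
		    epoch_t *start_eph)
{
	switch (mode) {
	case REINT_MODE_DATA_SYNC:
		*start_eph = 0;                 /* full resync */
		return true;
	case REINT_MODE_INCREMENTAL:
		*start_eph = global_stable_eph; /* only newer data */
		return true;
	case REINT_MODE_NO_DATA_SYNC:
	default:
		return false;                   /* no data movement */
	}
}
```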

NB: with "no_data_sync" enabled, containers will be turned to read-only, daos won't trigger
rebuild to restore the pool data redundancy on the surviving storage engines if there are
8 changes: 8 additions & 0 deletions src/common/lru.c
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -208,12 +209,14 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
{
struct daos_llink *llink;
d_list_t *link;
bool retried = false;
int rc = 0;

D_ASSERT(lcache != NULL && key != NULL && key_size > 0);
if (lcache->dlc_ops->lop_print_key)
lcache->dlc_ops->lop_print_key(key, key_size);

lookup_again:
link = d_hash_rec_find(&lcache->dlc_htable, key, key_size);
if (link != NULL) {
llink = link2llink(link);
Expand Down Expand Up @@ -242,6 +245,11 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
&llink->ll_link, true);
if (rc) {
lcache->dlc_ops->lop_free_ref(llink);
if (rc == -DER_EXIST && !retried) {
retried = true;
D_DEBUG(DB_TRACE, "lookup again as insert got -DER_EXIST\n");
goto lookup_again;
}
return rc;
}
lcache->dlc_count++;
4 changes: 3 additions & 1 deletion src/common/prop.c
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -334,7 +335,8 @@ daos_prop_valid(daos_prop_t *prop, bool pool, bool input)
case DAOS_PROP_PO_REINT_MODE:
val = prop->dpp_entries[i].dpe_val;
if (val != DAOS_REINT_MODE_DATA_SYNC &&
val != DAOS_REINT_MODE_NO_DATA_SYNC) {
val != DAOS_REINT_MODE_NO_DATA_SYNC &&
val != DAOS_REINT_MODE_INCREMENTAL) {
D_ERROR("invalid reintegration mode "DF_U64".\n", val);
return false;
}
88 changes: 45 additions & 43 deletions src/container/container_iv.c
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
@@ -191,8 +192,9 @@ cont_iv_ent_copy(struct ds_iv_entry *entry, struct cont_iv_key *key,
cip_acl.dal_ace[src->iv_prop.cip_acl.dal_len]);
memcpy(&dst->iv_prop, &src->iv_prop, size);
break;
case IV_CONT_AGG_EPOCH_BOUNDRY:
dst->iv_agg_eph.eph = src->iv_agg_eph.eph;
case IV_CONT_TRACK_EPOCH:
dst->iv_track_eph.ite_ec_agg_eph = src->iv_track_eph.ite_ec_agg_eph;
dst->iv_track_eph.ite_stable_eph = src->iv_track_eph.ite_stable_eph;
break;
default:
rc = -DER_INVAL;
@@ -551,9 +553,9 @@ cont_iv_ent_fetch(struct ds_iv_entry *entry, struct ds_iv_key *key,
return rc;
}

/* Update the EC agg epoch all servers to the leader */
/* Update the track epoch from all servers to the leader */
static int
cont_iv_ent_agg_eph_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
cont_iv_ent_track_eph_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
d_sg_list_t *src)
{
struct cont_iv_key *civ_key = key2priv(key);
@@ -568,25 +570,25 @@ cont_iv_ent_agg_eph_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
if (rank != entry->ns->iv_master_rank)
return -DER_IVCB_FORWARD;

rc = ds_cont_leader_update_agg_eph(entry->ns->iv_pool_uuid,
civ_key->cont_uuid,
civ_ent->iv_agg_eph.rank,
civ_ent->iv_agg_eph.eph);
rc = ds_cont_leader_update_track_eph(entry->ns->iv_pool_uuid, civ_key->cont_uuid,
civ_ent->iv_track_eph.ite_rank,
civ_ent->iv_track_eph.ite_ec_agg_eph,
civ_ent->iv_track_eph.ite_stable_eph);
return rc;
}

/* Each server refresh the VOS aggregation epoch gotten from the leader */
/* Each server refreshes the track epoch obtained from the leader */
static int
cont_iv_ent_agg_eph_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
cont_iv_ent_track_eph_refresh(struct ds_iv_entry *entry, struct ds_iv_key *key,
d_sg_list_t *src)
{
struct cont_iv_entry *civ_ent = src->sg_iovs[0].iov_buf;
struct cont_iv_key *civ_key = key2priv(key);
int rc;

rc = ds_cont_tgt_refresh_agg_eph(entry->ns->iv_pool_uuid,
civ_key->cont_uuid,
civ_ent->iv_agg_eph.eph);
rc = ds_cont_tgt_refresh_track_eph(entry->ns->iv_pool_uuid, civ_key->cont_uuid,
civ_ent->iv_track_eph.ite_ec_agg_eph,
civ_ent->iv_track_eph.ite_stable_eph);
return rc;
}

@@ -638,13 +640,12 @@ cont_iv_ent_update(struct ds_iv_entry *entry, struct ds_iv_key *key,
if (rc)
D_GOTO(out, rc);
} else if (entry->iv_class->iv_class_id ==
IV_CONT_AGG_EPOCH_REPORT) {
rc = cont_iv_ent_agg_eph_update(entry, key, src);
IV_CONT_TRACK_EPOCH_REPORT) {
rc = cont_iv_ent_track_eph_update(entry, key, src);
if (rc)
D_GOTO(out, rc);
} else if (entry->iv_class->iv_class_id ==
IV_CONT_AGG_EPOCH_BOUNDRY) {
rc = cont_iv_ent_agg_eph_refresh(entry, key, src);
} else if (entry->iv_class->iv_class_id == IV_CONT_TRACK_EPOCH) {
rc = cont_iv_ent_track_eph_refresh(entry, key, src);
if (rc)
D_GOTO(out, rc);
}
@@ -1068,20 +1069,20 @@ cont_iv_hdl_fetch(uuid_t cont_hdl_uuid, uuid_t pool_uuid,
return rc;
}

int
cont_iv_ec_agg_eph_update_internal(void *ns, uuid_t cont_uuid,
daos_epoch_t eph, unsigned int shortcut,
unsigned int sync_mode,
uint32_t op)
static int
cont_iv_track_eph_update_internal(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph,
daos_epoch_t stable_eph, unsigned int shortcut,
unsigned int sync_mode, uint32_t op)
{
struct cont_iv_entry iv_entry = { 0 };
int rc;

/* Only happens on xstream 0 */
D_ASSERT(dss_get_module_info()->dmi_xs_id == 0);
iv_entry.iv_agg_eph.eph = eph;
iv_entry.iv_track_eph.ite_ec_agg_eph = ec_agg_eph;
iv_entry.iv_track_eph.ite_stable_eph = stable_eph;
uuid_copy(iv_entry.cont_uuid, cont_uuid);
rc = crt_group_rank(NULL, &iv_entry.iv_agg_eph.rank);
rc = crt_group_rank(NULL, &iv_entry.iv_track_eph.ite_rank);
if (rc) {
D_ERROR(DF_UUID" op %d, crt_group_rank failed "DF_RC"\n",
DP_UUID(cont_uuid), op, DP_RC(rc));
@@ -1097,20 +1098,22 @@ cont_iv_ec_agg_eph_update_internal(void *ns, uuid_t cont_uuid,
}

int
cont_iv_ec_agg_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t eph)
cont_iv_track_eph_update(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph,
daos_epoch_t stable_eph)
{
return cont_iv_ec_agg_eph_update_internal(ns, cont_uuid, eph,
CRT_IV_SHORTCUT_TO_ROOT,
CRT_IV_SYNC_NONE,
IV_CONT_AGG_EPOCH_REPORT);
return cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph,
CRT_IV_SHORTCUT_TO_ROOT,
CRT_IV_SYNC_NONE,
IV_CONT_TRACK_EPOCH_REPORT);
}

int
cont_iv_ec_agg_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t eph)
cont_iv_track_eph_refresh(void *ns, uuid_t cont_uuid, daos_epoch_t ec_agg_eph,
daos_epoch_t stable_eph)
{
return cont_iv_ec_agg_eph_update_internal(ns, cont_uuid, eph,
0, CRT_IV_SYNC_LAZY,
IV_CONT_AGG_EPOCH_BOUNDRY);
return cont_iv_track_eph_update_internal(ns, cont_uuid, ec_agg_eph, stable_eph,
0, CRT_IV_SYNC_LAZY,
IV_CONT_TRACK_EPOCH);
}

int
Expand All @@ -1123,14 +1126,14 @@ ds_cont_fetch_ec_agg_boundary(void *ns, uuid_t cont_uuid)
/* Only happens on xstream 0 */
D_ASSERT(dss_get_module_info()->dmi_xs_id == 0);
uuid_copy(iv_entry.cont_uuid, cont_uuid);
rc = crt_group_rank(NULL, &iv_entry.iv_agg_eph.rank);
rc = crt_group_rank(NULL, &iv_entry.iv_track_eph.ite_rank);
if (rc) {
D_ERROR(DF_UUID" crt_group_rank failed "DF_RC"\n",
DP_UUID(cont_uuid), DP_RC(rc));
return rc;
}

rc = cont_iv_fetch(ns, IV_CONT_AGG_EPOCH_BOUNDRY, cont_uuid, &iv_entry,
rc = cont_iv_fetch(ns, IV_CONT_TRACK_EPOCH, cont_uuid, &iv_entry,
sizeof(struct cont_iv_entry), sizeof(struct cont_iv_entry),
true);
if (rc)
@@ -1200,11 +1203,11 @@ cont_iv_entry_delete(void *ns, uuid_t pool_uuid, uuid_t cont_uuid)
if (rc != 0)
D_DEBUG(DB_MD, "delete prop "DF_UUID"\n", DP_UUID(cont_uuid));

rc = cont_iv_invalidate(ns, IV_CONT_AGG_EPOCH_REPORT, cont_uuid, CRT_IV_SYNC_NONE);
rc = cont_iv_invalidate(ns, IV_CONT_TRACK_EPOCH_REPORT, cont_uuid, CRT_IV_SYNC_NONE);
if (rc != 0)
D_DEBUG(DB_MD, "delete agg epoch report "DF_UUID"\n", DP_UUID(cont_uuid));

rc = cont_iv_invalidate(ns, IV_CONT_AGG_EPOCH_BOUNDRY, cont_uuid, CRT_IV_SYNC_NONE);
rc = cont_iv_invalidate(ns, IV_CONT_TRACK_EPOCH, cont_uuid, CRT_IV_SYNC_NONE);
if (rc != 0)
D_DEBUG(DB_MD, "delete agg epoch boundary "DF_UUID"\n", DP_UUID(cont_uuid));

@@ -1665,8 +1668,8 @@ ds_cont_iv_fini(void)
ds_iv_class_unregister(IV_CONT_SNAP);
ds_iv_class_unregister(IV_CONT_CAPA);
ds_iv_class_unregister(IV_CONT_PROP);
ds_iv_class_unregister(IV_CONT_AGG_EPOCH_REPORT);
ds_iv_class_unregister(IV_CONT_AGG_EPOCH_BOUNDRY);
ds_iv_class_unregister(IV_CONT_TRACK_EPOCH_REPORT);
ds_iv_class_unregister(IV_CONT_TRACK_EPOCH);
return 0;
}

@@ -1687,13 +1690,12 @@ ds_cont_iv_init(void)
if (rc)
D_GOTO(out, rc);

rc = ds_iv_class_register(IV_CONT_AGG_EPOCH_REPORT, &iv_cache_ops,
rc = ds_iv_class_register(IV_CONT_TRACK_EPOCH_REPORT, &iv_cache_ops,
&cont_iv_ops);
if (rc)
D_GOTO(out, rc);

rc = ds_iv_class_register(IV_CONT_AGG_EPOCH_BOUNDRY, &iv_cache_ops,
&cont_iv_ops);
rc = ds_iv_class_register(IV_CONT_TRACK_EPOCH, &iv_cache_ops, &cont_iv_ops);
if (rc)
D_GOTO(out, rc);
out:
