Skip to content

Commit

Permalink
DAOS-16464 test: improve online_rebuild_mdtest.py (#15108)
Browse files Browse the repository at this point in the history
- Run with a stonewall and stop ranks after half of the stonewall time so
the timing is more reliable than arbitrarily sleeping for 30 seconds.
- Catch exceptions raised in the mdtest thread.
- Reduce logging.
- Misc refactoring improvements

Signed-off-by: Dalton Bohning <[email protected]>
Signed-off-by: Padmanabhan <[email protected]>
  • Loading branch information
daltonbohning authored Jan 23, 2025
1 parent a3d4d2f commit fb444f2
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 68 deletions.
21 changes: 5 additions & 16 deletions src/tests/ftest/erasurecode/online_rebuild_mdtest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2020-2023 Intel Corporation.
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand All @@ -14,11 +15,6 @@ class EcodOnlineRebuildMdtest(ErasureCodeMdtest):
:avocado: recursive
"""
def __init__(self, *args, **kwargs):
"""Initialize a EcOnlineRebuild object."""
super().__init__(*args, **kwargs)
self.set_online_rebuild = True

def test_ec_online_rebuild_mdtest(self):
"""Jira ID: DAOS-7320.
Expand All @@ -35,13 +31,6 @@ def test_ec_online_rebuild_mdtest(self):
:avocado: tags=ec,ec_array,mdtest,ec_online_rebuild
:avocado: tags=EcodOnlineRebuildMdtest,test_ec_online_rebuild_mdtest
"""
# Kill last server rank
self.rank_to_kill = self.server_count - 1

# Run only object type which matches the server count and
# remove other objects
for oclass in self.obj_class:
if oclass[1] == self.server_count:
self.obj_class = oclass[0]

self.start_online_mdtest()
# Stop one random rank while mdtest is running
ranks_to_stop = self.random.sample(list(self.server_managers[0].ranks), k=1)
self.start_online_mdtest(ranks_to_stop)
47 changes: 26 additions & 21 deletions src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ hosts:
12_server:
test_servers: server-[1-6]
test_clients: 2
timeout: 1000
timeout: 1500
setup:
start_agents_once: False
start_servers_once: False
Expand All @@ -21,13 +21,15 @@ server_config:
fabric_iface: ib0
fabric_iface_port: 31416
log_file: daos_server0.log
log_mask: ERR
storage: auto
1:
pinned_numa_node: 1
nr_xs_helpers: 1
fabric_iface: ib1
fabric_iface_port: 31517
log_file: daos_server1.log
log_mask: ERR
storage: auto
pool:
size: 93%
Expand All @@ -37,26 +39,29 @@ container:
properties: rd_fac:2
mdtest:
client_processes:
np_48:
np: 48
num_of_files_dirs: 200
mdtest_api:
dfs:
api: 'DFS'
test_dir: "/"
iteration: 4
np: 4
api: DFS
test_dir: /
dfs_destroy: True
manager: "MPICH"
flags: "-u"
write_bytes: 4194304
read_bytes: 4194304
manager: MPICH
flags: "-u -F -C"
write_bytes: 524288
read_bytes: 524288
depth: 10
num_of_files_dirs: 10000000
stonewall_timer: 30
# EC is not supported for directories, so for now directories use an RP object class
dfs_dir_oclass: "RP_3G1"
objectclass:
dfs_oclass_list:
#- [EC_Object_Class, Exact number of servers]
- ["EC_2P2GX", 6]
- ["EC_4P2GX", 8]
- ["EC_4P3GX", 12]
- ["EC_8P2GX", 12]
dfs_dir_oclass: RP_3G1
dfs_oclass_mux: !mux
6_server_ec2p2gx:
!filter-only : "/run/hosts/servers/6_server" # yamllint disable-line rule:colons
dfs_oclass: EC_2P2GX
8_server_ec4p2gx:
!filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons
dfs_oclass: EC_4P2GX
12_server_ec4p3gx:
!filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons
dfs_oclass: EC_4P3GX
12_server_ec8p2gx:
!filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons
dfs_oclass: EC_8P2GX
60 changes: 29 additions & 31 deletions src/tests/ftest/util/ec_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""
(C) Copyright 2020-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand Down Expand Up @@ -413,56 +414,53 @@ def start_online_single_operation(self, operation, parity=1):
class ErasureCodeMdtest(MdtestBase):
"""Class to used for EC testing for MDtest Benchmark."""

def __init__(self, *args, **kwargs):
"""Initialize a MdtestBase object."""
super().__init__(*args, **kwargs)
self.server_count = None
self.set_online_rebuild = False
self.rank_to_kill = None
self.obj_class = None

def setUp(self):
"""Set up each test case."""
super().setUp()
engine_count = self.server_managers[0].get_config_value("engines_per_host")
self.server_count = len(self.hostlist_servers) * engine_count
self.obj_class = self.params.get("dfs_oclass_list", '/run/mdtest/objectclass/*')
# Create Pool
self.add_pool()
self.container = None
self.out_queue = queue.Queue()

def write_single_mdtest_dataset(self):
"""Run MDtest with EC object type."""
# Update the MDtest obj class
self.mdtest_cmd.dfs_oclass.update(self.obj_class)
def _start_execute_mdtest(self, mdtest_result_queue):
"""Run the execute_mdtest method
# Write the MDtest data
self.execute_mdtest(self.out_queue)
Args:
mdtest_result_queue(Queue) : Queue for passing errors.
Returns:
result(object) : mdtest run result
"""
try:
result = self.execute_mdtest(mdtest_result_queue)
except Exception: # pylint: disable=broad-except
mdtest_result_queue.put('Mdtest Failed')
return result

def start_online_mdtest(self):
"""Run MDtest operation with thread in background.
def start_online_mdtest(self, ranks_to_stop):
"""Run mdtest and stop ranks while mdtest is running.
Trigger the server failure while MDtest is running
Args:
ranks_to_stop (list): ranks to stop while mdtest is running
"""
# Create the container and check the status
self.container = self.get_mdtest_container(self.pool)
# Create the MDtest run thread
job = threading.Thread(target=self.write_single_mdtest_dataset)
job = threading.Thread(
target=self._start_execute_mdtest,
kwargs={"mdtest_result_queue": self.out_queue})

# Launch the MDtest thread
job.start()

# Kill the server rank while IO operation in progress
if self.set_online_rebuild:
time.sleep(30)
# Kill the server rank
if self.rank_to_kill is not None:
self.server_managers[0].stop_ranks([self.rank_to_kill],
self.d_log,
force=True)
# Stop the server ranks while IO operation in progress
time.sleep(self.mdtest_cmd.stonewall_timer.value / 2)
self.server_managers[0].stop_ranks(ranks_to_stop, self.d_log, force=True)

# Wait to finish the thread
job.join()

# Verify the queue result and make sure test has no failure
while not self.out_queue.empty():
if self.out_queue.get() == "Mdtest Failed":
self.fail("FAIL")
result = self.out_queue.get()
if result == "Mdtest Failed":
self.fail(result)

0 comments on commit fb444f2

Please sign in to comment.