From fb444f295e887c018c3171b1ef5e88a1a61477ef Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Thu, 23 Jan 2025 15:20:14 -0800 Subject: [PATCH] DAOS-16464 test: improve online_rebuild_mdtest.py (#15108) - Run with a stonewall and stop ranks after half of the stonewall time so the timing is more reliable than arbitrarily sleeping for 30 seconds. - Catch exceptions raised in the mdtest thread. - Reduce logging. - Misc refactoring improvements Signed-off-by: Dalton Bohning Signed-off-by: Padmanabhan --- .../erasurecode/online_rebuild_mdtest.py | 21 ++----- .../erasurecode/online_rebuild_mdtest.yaml | 47 ++++++++------- src/tests/ftest/util/ec_utils.py | 60 +++++++++---------- 3 files changed, 60 insertions(+), 68 deletions(-) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.py b/src/tests/ftest/erasurecode/online_rebuild_mdtest.py index 8f320a077d4..c7e18c44e87 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.py +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.py @@ -1,5 +1,6 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -14,11 +15,6 @@ class EcodOnlineRebuildMdtest(ErasureCodeMdtest): :avocado: recursive """ - def __init__(self, *args, **kwargs): - """Initialize a EcOnlineRebuild object.""" - super().__init__(*args, **kwargs) - self.set_online_rebuild = True - def test_ec_online_rebuild_mdtest(self): """Jira ID: DAOS-7320. 
@@ -35,13 +31,6 @@ def test_ec_online_rebuild_mdtest(self): :avocado: tags=ec,ec_array,mdtest,ec_online_rebuild :avocado: tags=EcodOnlineRebuildMdtest,test_ec_online_rebuild_mdtest """ - # Kill last server rank - self.rank_to_kill = self.server_count - 1 - - # Run only object type which matches the server count and - # remove other objects - for oclass in self.obj_class: - if oclass[1] == self.server_count: - self.obj_class = oclass[0] - - self.start_online_mdtest() + # Stop one random rank while mdtest is running + ranks_to_stop = self.random.sample(list(self.server_managers[0].ranks), k=1) + self.start_online_mdtest(ranks_to_stop) diff --git a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml index 83c2831f4f0..13c652f96a8 100644 --- a/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml +++ b/src/tests/ftest/erasurecode/online_rebuild_mdtest.yaml @@ -7,7 +7,7 @@ hosts: 12_server: test_servers: server-[1-6] test_clients: 2 -timeout: 1000 +timeout: 1500 setup: start_agents_once: False start_servers_once: False @@ -21,6 +21,7 @@ server_config: fabric_iface: ib0 fabric_iface_port: 31416 log_file: daos_server0.log + log_mask: ERR storage: auto 1: pinned_numa_node: 1 @@ -28,6 +29,7 @@ server_config: fabric_iface: ib1 fabric_iface_port: 31517 log_file: daos_server1.log + log_mask: ERR storage: auto pool: size: 93% @@ -37,26 +39,29 @@ container: properties: rd_fac:2 mdtest: client_processes: - np_48: - np: 48 - num_of_files_dirs: 200 - mdtest_api: - dfs: - api: 'DFS' - test_dir: "/" - iteration: 4 + np: 4 + api: DFS + test_dir: / dfs_destroy: True - manager: "MPICH" - flags: "-u" - write_bytes: 4194304 - read_bytes: 4194304 + manager: MPICH + flags: "-u -F -C" + write_bytes: 524288 + read_bytes: 524288 depth: 10 + num_of_files_dirs: 10000000 + stonewall_timer: 30 # EC does not supported for directory so for now running with RP - dfs_dir_oclass: "RP_3G1" - objectclass: - dfs_oclass_list: - #- 
[EC_Object_Class, Exact number of servers] - - ["EC_2P2GX", 6] - - ["EC_4P2GX", 8] - - ["EC_4P3GX", 12] - - ["EC_8P2GX", 12] + dfs_dir_oclass: RP_3G1 + dfs_oclass_mux: !mux + 6_server_ec2p2gx: + !filter-only : "/run/hosts/servers/6_server" # yamllint disable-line rule:colons + dfs_oclass: EC_2P2GX + 8_server_ec4p2gx: + !filter-only : "/run/hosts/servers/8_server" # yamllint disable-line rule:colons + dfs_oclass: EC_4P2GX + 12_server_ec4p3gx: + !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons + dfs_oclass: EC_4P3GX + 12_server_ec8p2gx: + !filter-only : "/run/hosts/servers/12_server" # yamllint disable-line rule:colons + dfs_oclass: EC_8P2GX diff --git a/src/tests/ftest/util/ec_utils.py b/src/tests/ftest/util/ec_utils.py index 54ccda3b9aa..87469de9e72 100644 --- a/src/tests/ftest/util/ec_utils.py +++ b/src/tests/ftest/util/ec_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -413,56 +414,53 @@ def start_online_single_operation(self, operation, parity=1): class ErasureCodeMdtest(MdtestBase): """Class to used for EC testing for MDtest Benchmark.""" - def __init__(self, *args, **kwargs): - """Initialize a MdtestBase object.""" - super().__init__(*args, **kwargs) - self.server_count = None - self.set_online_rebuild = False - self.rank_to_kill = None - self.obj_class = None - def setUp(self): """Set up each test case.""" super().setUp() - engine_count = self.server_managers[0].get_config_value("engines_per_host") - self.server_count = len(self.hostlist_servers) * engine_count - self.obj_class = self.params.get("dfs_oclass_list", '/run/mdtest/objectclass/*') # Create Pool self.add_pool() + self.container = None self.out_queue = queue.Queue() - def write_single_mdtest_dataset(self): - """Run MDtest with EC object type.""" - # Update the MDtest obj class - self.mdtest_cmd.dfs_oclass.update(self.obj_class) + 
def _start_execute_mdtest(self, mdtest_result_queue): + """Run the execute_mdtest method - # Write the MDtest data - self.execute_mdtest(self.out_queue) + Args: + mdtest_result_queue(Queue) : Queue for passing errors. + Returns: + object: mdtest run result, or None if execute_mdtest raised an exception + """ + try: + return self.execute_mdtest(mdtest_result_queue) + except Exception: # pylint: disable=broad-except + mdtest_result_queue.put('Mdtest Failed') + return None # execute_mdtest raised; there is no result to return - def start_online_mdtest(self): - """Run MDtest operation with thread in background. + def start_online_mdtest(self, ranks_to_stop): + """Run mdtest and stop ranks while mdtest is running. - Trigger the server failure while MDtest is running + Args: + ranks_to_stop (list): ranks to stop while mdtest is running """ + # Create the container and check the status + self.container = self.get_mdtest_container(self.pool) # Create the MDtest run thread - job = threading.Thread(target=self.write_single_mdtest_dataset) + job = threading.Thread( + target=self._start_execute_mdtest, + kwargs={"mdtest_result_queue": self.out_queue}) # Launch the MDtest thread job.start() - # Kill the server rank while IO operation in progress - if self.set_online_rebuild: - time.sleep(30) - # Kill the server rank - if self.rank_to_kill is not None: - self.server_managers[0].stop_ranks([self.rank_to_kill], - self.d_log, - force=True) + # Stop the server ranks while IO operation in progress + time.sleep(self.mdtest_cmd.stonewall_timer.value / 2) + self.server_managers[0].stop_ranks(ranks_to_stop, self.d_log, force=True) # Wait to finish the thread job.join() # Verify the queue result and make sure test has no failure while not self.out_queue.empty(): - if self.out_queue.get() == "Mdtest Failed": - self.fail("FAIL") + result = self.out_queue.get() + if result == "Mdtest Failed": + self.fail(result)