From 037c6794f025c6743d4ab105ad4b31808fbc50f7 Mon Sep 17 00:00:00 2001
From: David Derler <david@dfinity.org>
Date: Fri, 28 Jun 2024 11:18:45 +0000
Subject: [PATCH] test: Improve XNet compatibility tests

---
 rs/tests/src/message_routing/compatibility.rs |  71 ++++--
 rs/tests/src/message_routing/xnet_slo_test.rs | 220 ++++++++++++------
 2 files changed, 204 insertions(+), 87 deletions(-)

diff --git a/rs/tests/src/message_routing/compatibility.rs b/rs/tests/src/message_routing/compatibility.rs
index f79d7afc59e..857399d71ed 100644
--- a/rs/tests/src/message_routing/compatibility.rs
+++ b/rs/tests/src/message_routing/compatibility.rs
@@ -7,11 +7,16 @@ Runbook::
 0. Deploy 1 root and 2 app subnets and install NNS canisters onto root,
    all running mainnet version.
 1. Bless current version.
-2. Run XNet test between two app subnets (success criteria same as for SLO test).
-3. Upgrade one app subnet to current version.
-4. Run XNet test again.
-5. Downgrade back to mainnet.
-6. Run XNet test again.
+2. Deploy and start XNet test canisters for long running XNet test
+3. Run XNet test between two app subnets (success criteria same as for SLO test).
+4. Upgrade one app subnet to current version.
+5. Run XNet test again.
+6. Downgrade back to mainnet.
+7. Run XNet test again.
+8. Tear down XNet test canisters for long running XNet test and check success
+   (success conditions for the long running test are more generous, as the main
+    expected signal is that upgrade/downgrade with messages around will succeed
+    and no messages are lost)
 
 Success::
 1. XNet test successfully completes for all version combinations
@@ -34,7 +39,7 @@ use crate::orchestrator::utils::{
         UpdateImageType,
     },
 };
-use crate::util::block_on;
+use crate::util::{block_on, runtime_from_url};
 use ic_registry_subnet_type::SubnetType;
 use slog::{info, Logger};
 use std::time::Duration;
@@ -118,7 +123,30 @@ pub async fn test_async(env: TestEnv) {
         .map(|s| (s.subnet_id, s.clone(), s.nodes().next().unwrap()))
         .collect();
 
-    let xnet_test_config = xnet_slo_test::Config::new(2, 1, Duration::from_secs(30), 10);
+    let app_subnet_runtimes = app_subnets
+        .clone()
+        .into_iter()
+        .map(|(_, _, node)| node)
+        .map(|node| runtime_from_url(node.get_public_url(), node.effective_canister_id()));
+
+    let xnet_config = xnet_slo_test::Config::new(2, 1, Duration::from_secs(30), 10);
+    let long_xnet_config = xnet_slo_test::Config::new_with_custom_thresholds(
+        2,
+        1,
+        // Given that we use `deploy_and_start` and `tear_down` directly
+        // the runtime parameter will be ignored for the main test run
+        // and only used when checking the success of the test. We set
+        // it conservatively low so that the success evaluation is more
+        // generous.
+        Duration::from_secs(90),
+        10,
+        0.3,
+        // Given that there are a couple of subnet upgrades happening
+        // while the long running test is running we are generous
+        // with error thresholds.
+        50.0,
+        40,
+    );
 
     let mainnet_version = env
         .read_dependency_to_string("testnet/mainnet_nns_revision.txt")
@@ -147,12 +175,17 @@ pub async fn test_async(env: TestEnv) {
 
     info!(&logger, "Blessed all versions.");
 
+    info!(&logger, "Starting long running XNet load");
+    let runtimes = app_subnet_runtimes.clone().collect::<Vec<_>>();
+    let long_running_canisters =
+        xnet_slo_test::deploy_and_start(env.clone(), &runtimes, &long_xnet_config, &logger).await;
+
     info!(&logger, "Starting XNet test between 2 app subnets.");
 
     xnet_slo_test::test_async_impl(
         env.clone(),
-        app_subnets.clone().into_iter().map(|(_, _, node)| node),
-        xnet_test_config.clone(),
+        app_subnet_runtimes.clone(),
+        xnet_config.clone(),
         &logger,
     )
     .await;
@@ -172,8 +205,8 @@ pub async fn test_async(env: TestEnv) {
 
     xnet_slo_test::test_async_impl(
         env.clone(),
-        app_subnets.clone().into_iter().map(|(_, _, node)| node),
-        xnet_test_config.clone(),
+        app_subnet_runtimes.clone(),
+        xnet_config.clone(),
         &logger,
     )
     .await;
@@ -191,13 +224,15 @@ pub async fn test_async(env: TestEnv) {
 
     info!(&logger, "Starting XNet test between 2 app subnets.");
 
-    xnet_slo_test::test_async_impl(
-        env,
-        app_subnets.clone().into_iter().map(|(_, _, node)| node),
-        xnet_test_config,
-        &logger,
-    )
-    .await;
+    xnet_slo_test::test_async_impl(env, app_subnet_runtimes, xnet_config.clone(), &logger).await;
+
+    info!(&logger, "Tearing down long running canisters.");
+
+    let metrics = xnet_slo_test::tear_down(&long_running_canisters, &logger).await;
+    assert!(
+        xnet_slo_test::check_success(metrics, &long_xnet_config, &logger),
+        "Long running canisters didn't meet success conditions."
+    );
 }
 
 async fn upgrade_to(
diff --git a/rs/tests/src/message_routing/xnet_slo_test.rs b/rs/tests/src/message_routing/xnet_slo_test.rs
index 4442bb03f7c..da1e427b0b6 100644
--- a/rs/tests/src/message_routing/xnet_slo_test.rs
+++ b/rs/tests/src/message_routing/xnet_slo_test.rs
@@ -11,8 +11,9 @@ Runbook::
 4. Stop sending messages for all canisters (via update `stop` call).
 5. Collect metrics from all canisters (via query `metrics` call).
 6. Aggregate metrics for each subnet (over its canisters).
-7. Assert error_ratio < 5%, no seq_errors, send_rate >= 0.3, responses_received > threshold (calculated dynamically).
-8. Stop/delete all canisters and assert operations' success.
+7. Stop/delete all canisters and assert operations' success.
+8. Assert error_ratio < 5%, no seq_errors, send_rate >= 0.3, responses_received > threshold (calculated dynamically).
+
 
 Success::
 1. Xnet canisters are successfully installed and started on each subnet.
@@ -28,7 +29,7 @@ use crate::driver::ic::{InternetComputer, Subnet};
 use crate::driver::pot_dsl::{PotSetupFn, SysTestFn};
 use crate::driver::test_env::TestEnv;
 use crate::driver::test_env_api::{
-    HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, IcNodeSnapshot, NnsInstallationBuilder,
+    HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, NnsInstallationBuilder,
 };
 use crate::util::{block_on, runtime_from_url};
 use canister_test::{Canister, Runtime};
@@ -67,6 +68,26 @@ pub struct Config {
 
 impl Config {
     pub fn new(subnets: usize, nodes_per_subnet: usize, runtime: Duration, rate: usize) -> Config {
+        Self::new_with_custom_thresholds(
+            subnets,
+            nodes_per_subnet,
+            runtime,
+            rate,
+            SEND_RATE_THRESHOLD,
+            ERROR_PERCENTAGE_THRESHOLD,
+            TARGETED_LATENCY_SECONDS,
+        )
+    }
+
+    pub fn new_with_custom_thresholds(
+        subnets: usize,
+        nodes_per_subnet: usize,
+        runtime: Duration,
+        rate: usize,
+        send_rate_threshold: f64,
+        error_percentage_threshold: f64,
+        targeted_latency_seconds: u64,
+    ) -> Config {
         // Subnet-to-subnet request rate: ceil(rate / subnet_connections).
         let subnet_to_subnet_rate = (rate - 1) / (subnets - 1) + 1;
         // Minimum number of subnet-to-subnet queues needed to stay under
@@ -84,9 +105,9 @@ impl Config {
             nodes_per_subnet,
             runtime,
             payload_size_bytes: PAYLOAD_SIZE_BYTES,
-            send_rate_threshold: SEND_RATE_THRESHOLD,
-            error_percentage_threshold: ERROR_PERCENTAGE_THRESHOLD,
-            targeted_latency_seconds: TARGETED_LATENCY_SECONDS,
+            send_rate_threshold,
+            error_percentage_threshold,
+            targeted_latency_seconds,
             subnet_to_subnet_rate,
             canisters_per_subnet,
             canister_to_subnet_rate,
@@ -141,40 +162,36 @@ pub async fn test_async(env: TestEnv, config: Config) {
 
     test_async_impl(
         env,
-        topology.subnets().map(|s| s.nodes().next().unwrap()),
+        topology
+            .subnets()
+            .map(|s| s.nodes().next().unwrap())
+            .map(|node| runtime_from_url(node.get_public_url(), node.effective_canister_id())),
         config,
         &logger,
     )
     .await;
 }
 
-/// Takes as input a testing environment, a list of nodes s.t. each node is on
-/// one of the subnets to deploy XNet test canisters to, and a configuration,
-/// and runs an instance of the XNet SLO test. It assumes the IC instance under
-/// test is already set up and ignores all `config` parameters related to the
-/// IC topology (e.g., `nodes_per_subnet`).
+/// Deploys the XNet test canister to each subnet and calls the `start` function on
+/// the canister with the given `config` parameters.Takes as input a testing environment,
+/// a list of endpoint runtimes s.t. each runtime corresponds to a node on one of the
+/// subnets to deploy XNet test canisters to, a configuration, and a logger. Returns
+/// vector of vectors with handles to the deployed canisters per subnet.
 ///
 ///
 /// # Panics
-/// - If the nodes provided in `nodes` are incompatible with `config`.
-/// - On test failure.
-pub(crate) async fn test_async_impl(
+/// - If the endpoints provided in `endpoint_runtimes` are incompatible with `config`.
+/// - On failure of one of the operations.
+pub(crate) async fn deploy_and_start<'a, 'b>(
     env: TestEnv,
-    nodes: impl Iterator<Item = IcNodeSnapshot>,
-    config: Config,
-    logger: &slog::Logger,
-) {
-    // Installing canisters on a subnet requires an Agent (or a Runtime wrapper around Agent).
-    // We need only one agent (runtime) per subnet for canister installation.
-    let endpoints_runtime: Vec<Runtime> = nodes
-        .map(|node| runtime_from_url(node.get_public_url(), node.effective_canister_id()))
-        .collect();
-    assert_eq!(endpoints_runtime.len(), config.subnets);
-    // Step 1: Install Xnet canisters on each subnet.
+    endpoints_runtimes: &'a [Runtime],
+    config: &'b Config,
+    logger: &'b slog::Logger,
+) -> Vec<Vec<Canister<'a>>> {
     info!(logger, "Installing Xnet canisters on subnets ...");
     let canisters = install_canisters(
         env.clone(),
-        &endpoints_runtime,
+        endpoints_runtimes,
         config.subnets,
         config.canisters_per_subnet,
     )
@@ -188,8 +205,7 @@ pub(crate) async fn test_async_impl(
         logger,
         "All {} canisters installed successfully.", canisters_count
     );
-    // Step 2: Start all canisters (via update `start` call).
-    info!(logger, "Calling start() on all canisters...");
+
     start_all_canisters(
         &canisters,
         config.payload_size_bytes,
@@ -205,22 +221,26 @@ pub(crate) async fn test_async_impl(
         config.payload_size_bytes,
         msgs_per_round * config.payload_size_bytes as usize
     );
-    // Step 3: Wait for canisters to exchange messages.
-    info!(
-        logger,
-        "Sending messages for {} secs...",
-        config.runtime.as_secs()
-    );
 
-    tokio::time::sleep(Duration::from_secs(config.runtime.as_secs())).await;
+    canisters
+}
 
-    // Step 4: Stop all canisters (via update `stop` call).
-    info!(logger, "Stopping all canisters...");
-    stop_all_canister(&canisters).await;
-    // Step 5: Collect metrics from all canisters (via query `metrics` call).
+/// Attempts to stop and delete the canisters. Takes as input a list of canisters
+/// and a logger. It calls the `stop` endpoint on all canisters and obtains the
+/// metrics from the `metrics` endpoint of all canisters.
+///
+///
+/// # Panics
+/// - On failure of one of the operations.
+pub(crate) async fn tear_down(
+    canisters: &[Vec<Canister<'_>>],
+    logger: &slog::Logger,
+) -> Vec<Metrics> {
+    stop_all_canister(canisters).await;
+    // Collect metrics from all canisters (via query `metrics` call).
     info!(logger, "Collecting metrics from all canisters...");
-    let metrics = collect_metrics(&canisters).await;
-    // Step 6: Aggregate metrics for each subnet (over its canisters).
+    let metrics = collect_metrics(canisters).await;
+    // Aggregate metrics for each subnet (over its canisters).
     info!(logger, "Aggregating metrics for each subnet...");
     let mut aggregated_metrics = Vec::<Metrics>::new();
     for (subnet_idx, subnet_metrics) in metrics.iter().enumerate() {
@@ -243,7 +263,48 @@ pub(crate) async fn test_async_impl(
             aggregated_metrics.last()
         );
     }
-    // Step 7. Assert metric are within limits.
+
+    info!(logger, "Stop/delete all canisters...");
+    // Stop all canisters.
+    let _: Vec<_> = parallel_async(
+        canisters.iter().flatten(),
+        |canister| {
+            info!(logger, "Stopping canister {} ...", canister.canister_id());
+            canister.stop()
+        },
+        |_, res| {
+            res.expect("Stopping canister failed.");
+        },
+    )
+    .await;
+
+    // Delete all canisters.
+    let _: Vec<_> = parallel_async(
+        canisters.iter().flatten(),
+        |canister| {
+            info!(logger, "Deleting canister {} ...", canister.canister_id());
+            canister.delete()
+        },
+        |_, res| {
+            res.expect("Deleting canister failed.");
+        },
+    )
+    .await;
+
+    aggregated_metrics
+}
+
+/// Checks whether the metrics (by themselves and/or relative to `config`)
+/// indicate a successful run: error ratio and latency below threshold, send
+/// rate and received responses aoove threshold, no sequence errors. Logs the
+/// outcome of each check.
+///
+/// Returns `true` on success, `false` otherwise.
+pub(crate) fn check_success(
+    aggregated_metrics: Vec<Metrics>,
+    config: &Config,
+    logger: &slog::Logger,
+) -> bool {
     info!(logger, "Asserting metrics are within limits...");
     let mut success = true;
     let mut expect =
@@ -346,34 +407,55 @@ pub(crate) async fn test_async_impl(
             );
         }
     }
-    info!(logger, "Stop/delete all canisters...");
-    // Step 8: Stop all canisters.
-    let _: Vec<_> = parallel_async(
-        canisters.iter().flatten(),
-        |canister| {
-            info!(logger, "Stopping canister {} ...", canister.canister_id());
-            canister.stop()
-        },
-        |_, res| {
-            res.expect("Stopping canister failed.");
-        },
-    )
-    .await;
 
-    // Step 9: Delete all canisters.
-    let _: Vec<_> = parallel_async(
-        canisters.iter().flatten(),
-        |canister| {
-            info!(logger, "Deleting canister {} ...", canister.canister_id());
-            canister.delete()
-        },
-        |_, res| {
-            res.expect("Deleting canister failed.");
-        },
-    )
-    .await;
+    success
+}
 
-    assert!(success, "Test failed.");
+/// Takes as input a testing environment, a list of nodes s.t. each node is on
+/// one of the subnets to deploy XNet test canisters to, and a configuration,
+/// and runs an instance of the XNet SLO test. It assumes the IC instance under
+/// test is already set up and ignores all `config` parameters related to the
+/// IC topology (e.g., `nodes_per_subnet`).
+///
+///
+/// # Panics
+/// - If the nodes provided in `nodes` are incompatible with `config`.
+/// - On test failure.
+pub(crate) async fn test_async_impl(
+    env: TestEnv,
+    endpoints_runtimes: impl Iterator<Item = Runtime>,
+    config: Config,
+    logger: &slog::Logger,
+) {
+    // Installing canisters on a subnet requires an Agent (or a Runtime wrapper around Agent).
+    // We need only one agent (runtime) per subnet for canister installation.
+    let endpoints_runtimes = endpoints_runtimes.collect::<Vec<_>>();
+    assert_eq!(endpoints_runtimes.len(), config.subnets);
+
+    // Step 1: Install Xnet canisters on each subnet.
+    // Step 2: Start all canisters (via update `start` call).
+    let canisters = deploy_and_start(env, &endpoints_runtimes, &config, logger).await;
+
+    // Step 3: Wait for canisters to exchange messages.
+    info!(
+        logger,
+        "Sending messages for {} secs...",
+        config.runtime.as_secs()
+    );
+    tokio::time::sleep(Duration::from_secs(config.runtime.as_secs())).await;
+
+    // Step 4: Stop all canisters (via update `stop` call).
+    // Step 5: Collect metrics from all canisters (via query `metrics` call).
+    // Step 6: Aggregate metrics for each subnet (over its canisters).
+    // Step 7: Stop/delete all canisters and assert operations' success.
+    info!(logger, "Stopping all canisters...");
+    let aggregated_metrics = tear_down(&canisters, logger).await;
+
+    // Step 8. Assert metric are within limits.
+    assert!(
+        check_success(aggregated_metrics, &config, logger),
+        "Test failed."
+    );
 }
 
 pub async fn stop_all_canister(canisters: &[Vec<Canister<'_>>]) {