From c9065698a9b9ee159d5204f56cbad75529293e4f Mon Sep 17 00:00:00 2001
From: jasonz-dfinity <133917836+jasonz-dfinity@users.noreply.github.com>
Date: Tue, 19 Nov 2024 13:40:31 -0800
Subject: [PATCH] feat(nns): Move inactive neurons in the heap to stable storage through a timer (#2643)

# Why

This change has two purposes:

* While the criteria for a neuron's target location don't change (inactive
  neuron -> stable), some neurons become inactive simply through the passage
  of time; this change migrates those neurons to stable storage in a timer.
* In the future, when stable storage for active neurons is enabled, this
  change will also move active neurons to stable storage in a similar fashion.

# What

* Removing a state machine test regarding neuron indexes; it is no longer
  useful because that migration is over.
* Adding a similar state machine test for the current use case.
* Adding the capability to move a batch of neurons from heap memory to stable
  storage.
* Scheduling a timer to execute the migration code.
---
 rs/nns/governance/canister/canister.rs        |  32 ++-
 rs/nns/governance/src/governance.rs           |   5 +
 rs/nns/governance/src/neuron_store.rs         |  88 ++++++++-
 .../src/neuron_store/neuron_store_tests.rs    | 105 ++++++++++
 .../src/governance_migrations.rs              | 184 +++++++-----------
 .../src/governance_neurons.rs                 |   9 +-
 rs/nns/test_utils/src/state_test_helpers.rs   |   5 +-
 7 files changed, 296 insertions(+), 132 deletions(-)

diff --git a/rs/nns/governance/canister/canister.rs b/rs/nns/governance/canister/canister.rs
index b96288a3165..a37ab0c70f7 100644
--- a/rs/nns/governance/canister/canister.rs
+++ b/rs/nns/governance/canister/canister.rs
@@ -158,6 +158,11 @@ fn set_governance(gov: Governance) {
         .expect("Error initializing the governance canister.");
 }
 
+fn schedule_timers() {
+    schedule_seeding(Duration::from_nanos(0));
+    schedule_adjust_neurons_storage(Duration::from_nanos(0), NeuronIdProto { id: 0 });
+}
+
 // Seeding interval seeks to find a balance between the need for rng secrecy, and
 // avoiding the overhead of frequent reseeding.
 const SEEDING_INTERVAL: Duration = Duration::from_secs(3600);
@@ -188,6 +193,29 @@ fn schedule_seeding(duration: Duration) {
     });
 }
 
+// The interval before adjusting neuron storage for the next batch of neurons, starting from the
+// last neuron id scanned in the previous batch.
+const ADJUST_NEURON_STORAGE_BATCH_INTERVAL: Duration = Duration::from_secs(5);
+// The interval before adjusting neuron storage for the next round, starting from the smallest
+// neuron id.
+const ADJUST_NEURON_STORAGE_ROUND_INTERVAL: Duration = Duration::from_secs(3600);
+
+fn schedule_adjust_neurons_storage(delay: Duration, start_neuron_id: NeuronIdProto) {
+    ic_cdk_timers::set_timer(delay, move || {
+        let next_neuron_id = governance_mut().batch_adjust_neurons_storage(start_neuron_id);
+        match next_neuron_id {
+            Some(next_neuron_id) => schedule_adjust_neurons_storage(
+                ADJUST_NEURON_STORAGE_BATCH_INTERVAL,
+                next_neuron_id,
+            ),
+            None => schedule_adjust_neurons_storage(
+                ADJUST_NEURON_STORAGE_ROUND_INTERVAL,
+                NeuronIdProto { id: 0 },
+            ),
+        };
+    });
+}
+
 struct CanisterEnv {
     rng: Option<ChaCha20Rng>,
     time_warp: GovTimeWarp,
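For context, `ic_cdk_timers::set_timer` fires exactly once, so the callback above re-arms itself: while the batch call returns a cursor, the next batch is scheduled after `ADJUST_NEURON_STORAGE_BATCH_INTERVAL`; once a full round completes (`None`), the scan restarts from neuron id 0 after `ADJUST_NEURON_STORAGE_ROUND_INTERVAL`. Below is a minimal, self-contained sketch of that self-rescheduling pattern; `process_batch`, the constants, and the numbers are illustrative stand-ins, not part of this patch.

```rust
use std::time::Duration;

const BATCH_INTERVAL: Duration = Duration::from_secs(5);
const ROUND_INTERVAL: Duration = Duration::from_secs(3600);

// Hypothetical stand-in for `batch_adjust_neurons_storage`: processes one batch starting at
// `cursor` and returns where to resume, or `None` when the current round is done.
fn process_batch(cursor: u64) -> Option<u64> {
    let next = cursor.saturating_add(1000);
    if next >= 10_000 {
        None
    } else {
        Some(next)
    }
}

fn schedule(delay: Duration, cursor: u64) {
    // `set_timer` is one-shot, so the callback schedules the next run itself.
    ic_cdk_timers::set_timer(delay, move || match process_batch(cursor) {
        // Work remains in this round: continue soon from the returned cursor.
        Some(next_cursor) => schedule(BATCH_INTERVAL, next_cursor),
        // Round complete: start over from the smallest id after a longer pause.
        None => schedule(ROUND_INTERVAL, 0),
    });
}
```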
@@ -408,7 +436,7 @@ fn canister_init_(init_payload: ApiGovernanceProto) {
         init_payload.neurons.len()
     );
 
-    schedule_seeding(Duration::from_nanos(0));
+    schedule_timers();
     set_governance(Governance::new(
         InternalGovernanceProto::from(init_payload),
         Box::new(CanisterEnv::new()),
@@ -452,7 +480,7 @@ fn canister_post_upgrade() {
         restored_state.xdr_conversion_rate,
     );
 
-    schedule_seeding(Duration::from_nanos(0));
+    schedule_timers();
     set_governance(Governance::new_restored(
         restored_state,
         Box::new(CanisterEnv::new()),
diff --git a/rs/nns/governance/src/governance.rs b/rs/nns/governance/src/governance.rs
index b125c0e53b4..e5cad36971b 100644
--- a/rs/nns/governance/src/governance.rs
+++ b/rs/nns/governance/src/governance.rs
@@ -6967,6 +6967,11 @@ impl Governance {
         })
     }
 
+    pub fn batch_adjust_neurons_storage(&mut self, start_neuron_id: NeuronId) -> Option<NeuronId> {
+        self.neuron_store
+            .batch_adjust_neurons_storage(start_neuron_id)
+    }
+
     /// Recompute cached metrics once per day
     pub fn should_compute_cached_metrics(&self) -> bool {
         if let Some(metrics) = self.heap_data.metrics.as_ref() {
diff --git a/rs/nns/governance/src/neuron_store.rs b/rs/nns/governance/src/neuron_store.rs
index 4083721bb74..f2793daf096 100644
--- a/rs/nns/governance/src/neuron_store.rs
+++ b/rs/nns/governance/src/neuron_store.rs
@@ -206,6 +206,7 @@ pub struct NeuronsFundNeuron {
     pub hotkeys: Vec<PrincipalId>,
 }
 
+#[derive(Eq, PartialEq)]
 enum StorageLocation {
     Heap,
     Stable,
@@ -475,6 +476,18 @@ impl NeuronStore {
         heap_len + stable_len
     }
 
+    // Returns the target storage location of a neuron. It might not be the actual storage location
+    // if the neuron already exists, for 2 possible reasons: (1) the target storage location logic
+    // has changed, e.g. after an upgrade; (2) the neuron was active but has become inactive due to
+    // the passage of time.
+    fn target_storage_location(&self, neuron: &Neuron) -> StorageLocation {
+        if self.use_stable_memory_for_all_neurons || neuron.is_inactive(self.now()) {
+            StorageLocation::Stable
+        } else {
+            StorageLocation::Heap
+        }
+    }
+
     /// Add a new neuron
     pub fn add_neuron(&mut self, neuron: Neuron) -> Result<NeuronId, NeuronStoreError> {
         let neuron_id = neuron.id();
@@ -485,7 +498,7 @@ impl NeuronStore {
             return Err(NeuronStoreError::NeuronAlreadyExists(neuron_id));
         }
 
-        if self.use_stable_memory_for_all_neurons || neuron.is_inactive(self.now()) {
+        if self.target_storage_location(&neuron) == StorageLocation::Stable {
             // Write as primary copy in stable storage.
             with_stable_neuron_store_mut(|stable_neuron_store| {
                 stable_neuron_store.create(neuron.clone())
@@ -566,6 +579,72 @@ impl NeuronStore {
         self.remove_neuron_from_indexes(&neuron_to_remove);
     }
 
+    /// Adjusts the storage location of neurons, since active neurons might become inactive due to
+    /// the passage of time.
+    pub fn batch_adjust_neurons_storage(&mut self, start_neuron_id: NeuronId) -> Option<NeuronId> {
+        static BATCH_SIZE_FOR_MOVING_NEURONS: usize = 1000;
+
+        #[cfg(target_arch = "wasm32")]
+        static MAX_NUM_INSTRUCTIONS_PER_BATCH: u64 = 5_000_000_000;
+
+        #[cfg(target_arch = "wasm32")]
+        let max_instructions_reached =
+            || ic_cdk::api::instruction_counter() >= MAX_NUM_INSTRUCTIONS_PER_BATCH;
+
+        #[cfg(not(target_arch = "wasm32"))]
+        let max_instructions_reached = || false;
+
+        self.adjust_neuron_storage_with_max_instructions(
+            start_neuron_id,
+            BATCH_SIZE_FOR_MOVING_NEURONS,
+            max_instructions_reached,
+        )
+    }
+
+    fn adjust_neuron_storage_with_max_instructions(
+        &mut self,
+        start_neuron_id: NeuronId,
+        max_batch_size: usize,
+        max_instructions_reached: impl Fn() -> bool,
+    ) -> Option<NeuronId> {
+        // We currently only move neurons from heap to stable storage, since it's impossible to have
+        // active neurons in stable storage. In the future, we might need to move neurons from
+        // stable storage to heap as a rollback mechanism, but it is not implemented here yet.
+        let neuron_ids: Vec<_> = self
+            .heap_neurons
+            .range(start_neuron_id.id..)
+            .take(max_batch_size)
+            .map(|(id, _)| NeuronId { id: *id })
+            .collect();
+        // We know it is the last batch if the number of neurons is less than the batch size.
+        let is_last_batch = neuron_ids.len() < max_batch_size;
+
+        if neuron_ids.is_empty() {
+            return None;
+        }
+
+        let mut next_neuron_id = Some(start_neuron_id);
+
+        for neuron_id in neuron_ids {
+            if max_instructions_reached() {
+                // We don't need to look at the `is_last_batch` because at least one neuron is
+                // skipped due to instruction limit.
+                return next_neuron_id;
+            }
+
+            // We don't modify the neuron, but the below just makes sure that the neuron is in the
+            // appropriate storage location given its state and the current time.
+            let _ = self.with_neuron_mut(&neuron_id, |_| {});
+            next_neuron_id = neuron_id.next();
+        }
+
+        if is_last_batch {
+            None
+        } else {
+            next_neuron_id
+        }
+    }
+
     fn remove_neuron_from_indexes(&mut self, neuron: &Neuron) {
         let neuron_id = neuron.id();
         if let Err(error) = with_stable_neuron_indexes_mut(|indexes| indexes.remove_neuron(neuron))
@@ -653,12 +732,7 @@ impl NeuronStore {
         new_neuron: Neuron,
         previous_location: StorageLocation,
     ) -> Result<(), NeuronStoreError> {
-        let target_location =
-            if self.use_stable_memory_for_all_neurons || new_neuron.is_inactive(self.now()) {
-                StorageLocation::Stable
-            } else {
-                StorageLocation::Heap
-            };
+        let target_location = self.target_storage_location(&new_neuron);
         let is_neuron_changed = *old_neuron != new_neuron;
 
         self.validate_neuron(&new_neuron)?;
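The `Option<NeuronId>` returned above acts as a resume cursor: `Some(id)` means "call again starting from `id`", while `None` means the scan reached the end of the heap. A minimal sketch of how a caller can drain the heap by following the cursor (illustrative only, not part of the patch; `neuron_store` is assumed to be a `NeuronStore` as defined above, and the instruction check is always false outside wasm):

```rust
// Follow the cursor until the store reports that the last batch has been processed.
let mut cursor = Some(NeuronId { id: 0 });
while let Some(start_neuron_id) = cursor {
    // Each call visits at most BATCH_SIZE_FOR_MOVING_NEURONS neurons and returns
    // the id to resume from, or None once the last batch is done.
    cursor = neuron_store.batch_adjust_neurons_storage(start_neuron_id);
}
```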
diff --git a/rs/nns/governance/src/neuron_store/neuron_store_tests.rs b/rs/nns/governance/src/neuron_store/neuron_store_tests.rs
index b63785b9b40..ca7884d5c2c 100644
--- a/rs/nns/governance/src/neuron_store/neuron_store_tests.rs
+++ b/rs/nns/governance/src/neuron_store/neuron_store_tests.rs
@@ -10,6 +10,7 @@ use ic_nns_constants::GOVERNANCE_CANISTER_ID;
 use maplit::{btreemap, hashmap, hashset};
 use num_traits::bounds::LowerBounded;
 use pretty_assertions::assert_eq;
+use std::cell::Cell;
 
 static CREATED_TIMESTAMP_SECONDS: u64 = 123_456_789;
 
@@ -695,6 +696,110 @@ fn test_get_non_empty_neuron_ids_readable_by_caller() {
     );
 }
 
+#[test]
+fn test_batch_adjust_neurons_storage() {
+    // This test doesn't make sense after neurons are migrated completely to stable memory.
+    let _f = temporarily_disable_active_neurons_in_stable_memory();
+
+    // Step 1.1: set up an empty neuron store.
+    let mut neuron_store = NeuronStore::new(BTreeMap::new());
+
+    // Step 1.2: set up 5 active neurons with stake.
+    for i in 1..=5 {
+        let neuron = simple_neuron_builder(i)
+            .with_cached_neuron_stake_e8s(1)
+            .with_dissolve_state_and_age(DissolveStateAndAge::DissolvingOrDissolved {
+                when_dissolved_timestamp_seconds: neuron_store.now(),
+            })
+            .build();
+        neuron_store.add_neuron(neuron).unwrap();
+    }
+
+    // Step 1.3: set up 5 active neurons without stake, which will become inactive when the time is
+    // advanced.
+    for i in 6..=10 {
+        let neuron = active_neuron_builder(i, neuron_store.now()).build();
+        neuron_store.add_neuron(neuron).unwrap();
+    }
+
+    // Step 1.4: warp time so that those neurons become inactive without modification.
+    warp_time_to_make_neuron_inactive(&mut neuron_store);
+
+    // Step 1.5: define a lambda which always returns false, for the instruction check.
+    let always_false = || false;
+
+    // Step 1.6: make sure the counts of neurons in heap and stable are as expected.
+    assert_eq!(neuron_store.heap_neuron_store_len(), 10);
+    assert_eq!(neuron_store.stable_neuron_store_len(), 0);
+
+    // Step 2: adjust the storage of the first 6 neurons and verify the counts. Since the first 5
+    // neurons are active because of their stake, only 1 neuron is moved.
+    let next_neuron_id = neuron_store.adjust_neuron_storage_with_max_instructions(
+        NeuronId { id: 0 },
+        6,
+        always_false,
+    );
+    assert_eq!(next_neuron_id, Some(NeuronId { id: 7 }));
+    assert_eq!(neuron_store.heap_neuron_store_len(), 9);
+    assert_eq!(neuron_store.stable_neuron_store_len(), 1);
+
+    // Step 3: adjust the storage of the remaining 4 neurons and verify the counts.
+    let next_neuron_id = neuron_store.adjust_neuron_storage_with_max_instructions(
+        NeuronId { id: 7 },
+        6,
+        always_false,
+    );
+    assert_eq!(next_neuron_id, None);
+    assert_eq!(neuron_store.heap_neuron_store_len(), 5);
+    assert_eq!(neuron_store.stable_neuron_store_len(), 5);
+}
+
+#[test]
+fn test_batch_adjust_neurons_storage_exceeds_instructions_limit() {
+    // This test doesn't make sense after neurons are migrated completely to stable memory.
+    let _f = temporarily_disable_active_neurons_in_stable_memory();
+
+    // Step 1.1: set up an empty neuron store.
+    let mut neuron_store = NeuronStore::new(BTreeMap::new());
+
+    // Step 1.2: set up 5 active neurons without stake, which will become inactive when the time is
+    // advanced.
+    for i in 1..=5 {
+        let neuron = active_neuron_builder(i, neuron_store.now()).build();
+        neuron_store.add_neuron(neuron).unwrap();
+    }
+
+    // Step 1.4: warp time so that those neurons become inactive without modification.
+    warp_time_to_make_neuron_inactive(&mut neuron_store);
+
+    // Step 1.5: make sure the counts of neurons in heap and stable are as expected.
+    assert_eq!(neuron_store.heap_neuron_store_len(), 5);
+    assert_eq!(neuron_store.stable_neuron_store_len(), 0);
+
+    // Step 2: adjust the storage of the first 10 neurons; however, the instruction limit checker
+    // returns true the 4th time it's called, allowing only 3 neurons to move.
+    let counter = Cell::new(0);
+    let next_neuron_id =
+        neuron_store.adjust_neuron_storage_with_max_instructions(NeuronId { id: 0 }, 10, || {
+            counter.set(counter.get() + 1);
+            counter.get() > 3
+        });
+    assert_eq!(next_neuron_id, Some(NeuronId { id: 4 }));
+    assert_eq!(neuron_store.heap_neuron_store_len(), 2);
+    assert_eq!(neuron_store.stable_neuron_store_len(), 3);
+
+    // Step 3: adjust the storage of the remaining 2 neurons and verify the counts.
+    let counter = Cell::new(0);
+    let next_neuron_id =
+        neuron_store.adjust_neuron_storage_with_max_instructions(NeuronId { id: 4 }, 10, || {
+            counter.set(counter.get() + 1);
+            counter.get() > 3
+        });
+    assert_eq!(next_neuron_id, None);
+    assert_eq!(neuron_store.heap_neuron_store_len(), 0);
+    assert_eq!(neuron_store.stable_neuron_store_len(), 5);
+}
+
 #[test]
 fn test_get_full_neuron() {
     let principal_id = PrincipalId::new_user_test_id(42);
diff --git a/rs/nns/integration_tests/src/governance_migrations.rs b/rs/nns/integration_tests/src/governance_migrations.rs
index ea0b28f1fbc..5aa94978b00 100644
--- a/rs/nns/integration_tests/src/governance_migrations.rs
+++ b/rs/nns/integration_tests/src/governance_migrations.rs
@@ -1,50 +1,23 @@
 use assert_matches::assert_matches;
 use candid::{Decode, Encode};
+use ic_base_types::PrincipalId;
 use ic_canisters_http_types::{HttpRequest, HttpResponse};
 use ic_nns_constants::GOVERNANCE_CANISTER_ID;
-use ic_nns_governance::neuron_data_validation::NeuronDataValidationSummary;
-use ic_nns_governance_api::pb::v1::{
-    manage_neuron_response::{Command, FollowResponse, SplitResponse},
-    Topic,
-};
+use ic_nns_governance_api::pb::v1::manage_neuron_response::Command;
 use ic_nns_test_utils::{
     common::NnsInitPayloadsBuilder,
-    neuron_helpers::{get_neuron_1, get_neuron_2, get_neuron_3},
     state_test_helpers::{
-        nns_set_followees_for_neuron, nns_split_neuron, query, setup_nns_canisters,
+        nns_claim_or_refresh_neuron, nns_disburse_neuron, nns_send_icp_to_claim_or_refresh_neuron,
+        nns_start_dissolving, query, setup_nns_canisters_with_features,
         state_machine_builder_for_nns_tests,
     },
 };
 use ic_state_machine_tests::StateMachine;
+use icp_ledger::{AccountIdentifier, Tokens};
 use serde_bytes::ByteBuf;
 use std::time::Duration;
 
-fn assert_no_validation_issues(state_machine: &StateMachine) {
-    let response_bytes = query(
-        state_machine,
-        GOVERNANCE_CANISTER_ID,
-        "get_neuron_data_validation_summary",
-        Encode!(&{}).unwrap(),
-    )
-    .unwrap();
-    let summary = Decode!(&response_bytes, NeuronDataValidationSummary).unwrap();
-    assert_eq!(summary.current_validation_started_time_seconds, None);
-    let current_issues_summary = summary.current_issues_summary.unwrap();
-    assert_eq!(current_issues_summary.issue_groups, vec![]);
-}
-
-struct NeuronIndexesLens {
-    subaccount: u64,
-    principal: u64,
-    following: u64,
-    known_neuron: u64,
-    account_id: u64,
-}
-
-fn assert_neuron_indexes_lens(
-    state_machine: &StateMachine,
-    neuron_indexes_lens: NeuronIndexesLens,
-) {
+fn assert_metric(state_machine: &StateMachine, name: &str, value: u64) {
     let response_bytes = query(
         state_machine,
         GOVERNANCE_CANISTER_ID,
@@ -61,108 +34,81 @@ fn assert_neuron_indexes_lens(
     let response: HttpResponse = Decode!(&response_bytes, HttpResponse).unwrap();
     let response_body = String::from_utf8(response.body.into_vec()).unwrap();
 
-    assert!(response_body.contains(&format!(
-        "governance_subaccount_index_len {} ",
-        neuron_indexes_lens.subaccount
-    )));
-    assert!(response_body.contains(&format!(
-        "governance_principal_index_len {} ",
-        neuron_indexes_lens.principal
-    )));
-    assert!(response_body.contains(&format!(
-        "governance_following_index_len {} ",
-        neuron_indexes_lens.following
-    )));
-    assert!(response_body.contains(&format!(
-        "governance_known_neuron_index_len {} ",
-        neuron_indexes_lens.known_neuron
-    )));
-    assert!(response_body.contains(&format!(
-        "governance_account_id_index_len {} ",
-        neuron_indexes_lens.account_id
-    )));
+    let line = response_body
+        .lines()
+        .filter(|line| line.starts_with(name))
+        .collect::<Vec<_>>()
+        .first()
+        .unwrap()
+        .to_string();
+    assert!(
+        line.starts_with(&format!("{} {} ", name, value)),
+        "{}",
+        line
+    );
 }
 
 #[test]
-fn test_neuron_indexes_migrations() {
+fn test_neuron_migration_from_heap_to_stable() {
     let state_machine = state_machine_builder_for_nns_tests().build();
-    let nns_init_payloads = NnsInitPayloadsBuilder::new().with_test_neurons().build();
-    setup_nns_canisters(&state_machine, nns_init_payloads);
+    let test_user_principal = PrincipalId::new_self_authenticating(&[1]);
+    let nonce = 123_456;
+    let nns_init_payloads = NnsInitPayloadsBuilder::new()
+        .with_ledger_account(
+            AccountIdentifier::new(test_user_principal, None),
+            Tokens::from_e8s(2_000_000_000),
+        )
+        .build();
+    // Make sure the test feature is not enabled. Otherwise new neurons will be created in the
+    // stable memory, which makes the test precondition wrong.
+    setup_nns_canisters_with_features(&state_machine, nns_init_payloads, &[]);
+    nns_send_icp_to_claim_or_refresh_neuron(
+        &state_machine,
+        test_user_principal,
+        Tokens::from_e8s(1_000_000_000),
+        nonce,
+    );
+    let neuron_id = nns_claim_or_refresh_neuron(&state_machine, test_user_principal, nonce);
 
-    // Let heartbeat run and validation progress.
+    // Let heartbeat/timer run.
     for _ in 0..20 {
         state_machine.tick();
     }
 
-    assert_neuron_indexes_lens(
+    // Make sure that the neuron is in the heap memory and active.
+    assert_metric(
         &state_machine,
-        NeuronIndexesLens {
-            subaccount: 3,
-            principal: 3,
-            following: 0,
-            known_neuron: 0,
-            account_id: 3,
-        },
+        "governance_garbage_collectable_neurons_count",
+        0,
     );
-    assert_no_validation_issues(&state_machine);
+    assert_metric(&state_machine, "governance_heap_neuron_count", 1);
+    assert_metric(&state_machine, "governance_stable_memory_neuron_count", 0);
 
-    let neuron_1 = get_neuron_1();
-    let neuron_2 = get_neuron_2();
-    let neuron_3 = get_neuron_3();
+    // Advance time and disburse the neuron so that it's empty.
+    nns_start_dissolving(&state_machine, test_user_principal, neuron_id).unwrap();
+    let time_to_dissolve = Duration::from_secs(60 * 60 * 24 * 7);
+    state_machine.advance_time(time_to_dissolve);
+    let disburse_response =
+        nns_disburse_neuron(&state_machine, test_user_principal, neuron_id, None, None);
+    assert_matches!(disburse_response.command, Some(Command::Disburse(_)));
 
-    // Follow will cause the neuron to be modified.
-    let follow_response = nns_set_followees_for_neuron(
-        &state_machine,
-        neuron_3.principal_id,
-        neuron_3.neuron_id,
-        &[neuron_1.neuron_id, neuron_2.neuron_id],
-        Topic::Governance as i32,
-    )
-    .command
-    .expect("Manage neuron command failed");
-    assert_eq!(follow_response, Command::Follow(FollowResponse {}));
-
-    assert_neuron_indexes_lens(
-        &state_machine,
-        NeuronIndexesLens {
-            subaccount: 3,
-            principal: 3,
-            following: 2,
-            known_neuron: 0,
-            account_id: 3,
-        },
-    );
-
-    // Split will cause a neuron to be created.
-    let split_response = nns_split_neuron(
-        &state_machine,
-        neuron_1.principal_id,
-        neuron_1.neuron_id,
-        500_000_000,
-    )
-    .command
-    .expect("Manage neuron command failed");
-    assert_matches!(split_response, Command::Split(SplitResponse { .. }));
+    // After 14 days the neuron will become inactive. Advance enough time for that.
+    let time_to_become_inactive = Duration::from_secs(60 * 60 * 24 * 20);
+    state_machine.advance_time(time_to_become_inactive);
 
-    assert_neuron_indexes_lens(
-        &state_machine,
-        NeuronIndexesLens {
-            subaccount: 4,
-            principal: 4,
-            following: 2,
-            known_neuron: 0,
-            account_id: 4,
-        },
-    );
-
-    // Advance time so the validation can rerun.
-    let two_days = Duration::from_secs(60 * 60 * 24 * 2);
-    state_machine.advance_time(two_days);
-
-    // Let heartbeat run and validation progress again.
+    // Let timer run.
     for _ in 0..20 {
+        state_machine.advance_time(Duration::from_secs(60 * 60));
         state_machine.tick();
     }
 
-    assert_no_validation_issues(&state_machine);
+    // After the timer runs, the inactive (garbage collectable) neuron should be moved to the stable
+    // memory.
+    assert_metric(
+        &state_machine,
+        "governance_garbage_collectable_neurons_count",
+        1,
+    );
+    assert_metric(&state_machine, "governance_heap_neuron_count", 0);
+    assert_metric(&state_machine, "governance_stable_memory_neuron_count", 1);
 }
diff --git a/rs/nns/integration_tests/src/governance_neurons.rs b/rs/nns/integration_tests/src/governance_neurons.rs
index 461b2cb81b5..6d6fb4cbc7d 100644
--- a/rs/nns/integration_tests/src/governance_neurons.rs
+++ b/rs/nns/integration_tests/src/governance_neurons.rs
@@ -527,8 +527,13 @@ fn test_list_neurons() {
         .expect("Failed to start dissolving neuron");
     state_machine.advance_time(Duration::from_secs(INITIAL_NEURON_DISSOLVE_DELAY + 1));
     state_machine.tick();
-    let disburse_result =
-        nns_disburse_neuron(&state_machine, principal_1, neuron_id_2, 200_000_000, None);
+    let disburse_result = nns_disburse_neuron(
+        &state_machine,
+        principal_1,
+        neuron_id_2,
+        Some(200_000_000),
+        None,
+    );
 
     match disburse_result {
         ManageNeuronResponse {
diff --git a/rs/nns/test_utils/src/state_test_helpers.rs b/rs/nns/test_utils/src/state_test_helpers.rs
index f5483d8797a..599e05bbdfe 100644
--- a/rs/nns/test_utils/src/state_test_helpers.rs
+++ b/rs/nns/test_utils/src/state_test_helpers.rs
@@ -989,7 +989,7 @@ pub fn nns_disburse_neuron(
     state_machine: &StateMachine,
     sender: PrincipalId,
     neuron_id: NeuronId,
-    amount_e8s: u64,
+    amount_e8s: Option<u64>,
     to_account: Option<AccountIdentifier>,
 ) -> ManageNeuronResponse {
     manage_neuron(
         state_machine,
         sender,
         neuron_id,
         ManageNeuronCommandRequest::Disburse(Disburse {
-            amount: Some(manage_neuron::disburse::Amount { e8s: amount_e8s }),
+            amount: amount_e8s
+                .map(|amount_e8s| manage_neuron::disburse::Amount { e8s: amount_e8s }),
             to_account: to_account.map(|account| account.into()),
         }),
     )
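With this change, `nns_disburse_neuron` takes `Option<u64>` for the amount: `None` omits the `amount` field of the `Disburse` command, which governance treats as disbursing the neuron's entire stake, while `Some(amount)` disburses only that many e8s. Both call shapes already appear in the diffs above; the sketch below is a usage summary only, with the surrounding test scaffolding (`state_machine`, principals, neuron ids) assumed.

```rust
// Disburse the full stake: no amount means "everything" to governance.
let _full = nns_disburse_neuron(&state_machine, test_user_principal, neuron_id, None, None);

// Disburse 2 ICP (200_000_000 e8s) to the default account.
let _partial = nns_disburse_neuron(
    &state_machine,
    principal_1,
    neuron_id_2,
    Some(200_000_000),
    None,
);
```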